# Chapter 4: Unigrams and n-grams

In [1]:
import configparser
from getpass import getuser
config = configparser.ConfigParser()
config.read(f"../config/{getuser()}.ini")
if len(config.sections()) == 0:
    config.read("config/default.ini")
DATA_PATH = config["Data"]["path"]

import os
import pandas as pd
df = pd.read_pickle(os.path.join(DATA_PATH, "df.pkl"))
print(df.shape)

(7131, 11)


## Data Cleaning

In [2]:
import nltk
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

sw = stopwords.words("english")
wnl = WordNetLemmatizer()
df["Title"] = df["Title"].apply(lambda x: " ".join([wnl.lemmatize(w.lower()) for w in x.split() if w not in sw]))
df.dropna(subset=["Title"], inplace=True)
display(df.head())

X = df["Title"]
y = df["Sentiment"]

Unnamed: 0,Title,URL,Time_Published,Body_Text,Company,Symbol,Positive,Negative,Polarity,News_Id,Sentiment
0,trump move military hospital precautionary mea...,http://www.aastocks.com/en/stocks/news/aafn-co...,2020-10-03 08:13:00,US President Donald Trump moved to a military ...,,,44,38,0.073171,NOW.1046693,0
1,"hkadr project hsi open up 352 pt 23,812 next mon",http://www.aastocks.com/en/stocks/news/aafn-co...,2020-10-03 08:30:00,ADR Code│Relative price (HKD)│Premium of HK st...,ADR Code│Relative price,HKD,8,4,0.333333,NOW.1046694,1
2,"uk, french mkts rise hope more relief measure",http://www.aastocks.com/en/stocks/news/aafn-co...,2020-10-03 08:19:00,UK and French bourses clawed back as Washingto...,,,2,0,1.0,NOW.1046695,1
3,djia narrow loss 134 pt after trump contract c...,http://www.aastocks.com/en/stocks/news/aafn-co...,2020-10-03 08:26:00,DJIA futures once collapsed over 500 pts after...,,,3,1,0.5,NOW.1046696,1
4,vix index up 3.5%,http://www.aastocks.com/en/stocks/news/aafn-co...,2020-10-03 08:27:00,"VIX Index rose 0.93 pts, or 3.5%, to 27.63.",,,3,1,0.5,NOW.1046697,1


## Fitting and evaluating models for unigram to trigram

### SVM

In [3]:
from sklearn.svm import SVC
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

for n in [1, 2, 3]:
    max_accuracy = max_accuracy_c = 0
    for c in np.arange(0.005, 1, 0.2).tolist() + np.arange(1, 50, 4).tolist():
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35)
        count_vectorizer = CountVectorizer(ngram_range=(n, n))
        count_vectorizer.fit(X_train)
        X_train = count_vectorizer.transform(X_train)
        X_test = count_vectorizer.transform(X_test)
        
        model = SVC(C=c)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        
        accuracy = accuracy_score(y_test, y_pred)
        if accuracy > max_accuracy:
            max_accuracy = accuracy
            max_accuracy_c = c
    print(f"{n}-gram:")
    print("\tMax accuracy:", max_accuracy, f"\n\tC={max_accuracy_c}")

1-gram:
	Max accuracy: 0.7479967948717948 
	C=0.805
2-gram:
	Max accuracy: 0.7504006410256411 
	C=0.6050000000000001
3-gram:
	Max accuracy: 0.7411858974358975 
	C=25


Still far from satisfactory (although much better than the lexicon based methods)