# Sentiment Analysis

## Imports

In [1]:
import nltk
import string
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
nltk.download(
    ["punkt", "stopwords"]
)

[nltk_data] Downloading package punkt to /home/oak/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/oak/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Loading Data

In [3]:
df = pd.read_csv("data/raw.csv")

In [4]:
df.head()

Unnamed: 0,reviewText,Sentiment
0,This is a one of the best apps acording to a b...,Positive
1,This is a pretty good version of the game for ...,Positive
2,this is a really cool game. there are a bunch ...,Positive
3,"This is a silly game and can be frustrating, b...",Positive
4,This is a terrific game on any pad. Hrs of fun...,Positive


## Data Preprocessing

### Numerical Normalization

In [5]:
df.loc[df["Sentiment"] == "Positive", "Sentiment"] = 1
df.loc[df["Sentiment"] == "Negative", "Sentiment"] = 0

In [6]:
df.sample(10, random_state = 1)

Unnamed: 0,reviewText,Sentiment
11456,"I hav nt found a real use for this app,perhaps...",0
16528,Either that or we just could not figure out ho...,0
3253,had to remove it because it confused me. The d...,0
18614,Agree with the other reviewers - unresponsive ...,0
1544,I love this app! Not only because I can now fo...,1
12568,I wanted something like alt/tab on my android ...,1
15497,"I am loving this app! UI is simple, functional...",1
13987,"We have downloaded several drawing apps, but t...",1
9598,"Currently, I am using it to keep my favorite g...",1
6668,this is a very useful app. we have used it suc...,1


### Splitting Data

In [7]:
x = df["reviewText"]
y = df["Sentiment"]
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size = 0.2,
    shuffle = True,
    random_state = 1
)

### Sentence Normalization

In [8]:
stemmer = nltk.stem.snowball.SnowballStemmer("english")
stopwords = nltk.corpus.stopwords.words("english")

In [9]:
def normalize (sentence):
    return " ".join(
        [
            stemmer.stem(token)
            for token in nltk.word_tokenize(sentence) if token not in stopwords and not set(token).issubset(string.punctuation + string.digits)
        ]
    )

In [10]:
x_train_cleaned = [
    normalize(sentence) for sentence in x_train
]

In [11]:
x_test_cleaned = [
    normalize(sentence) for sentence in x_test
]

### Sentence Vectorization

In [12]:
vectorizer = TfidfVectorizer(
    strip_accents = "unicode",
    analyzer = "word",
    ngram_range = (1, 2)
)
x_train_tfidf = vectorizer.fit_transform(x_train_cleaned)

In [13]:
x_test_tfidf = vectorizer.transform(x_test_cleaned)

In [14]:
pd.DataFrame.sparse.from_spmatrix(x_train_tfidf)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,160868,160869,160870,160871,160872,160873,160874,160875,160876,160877
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Models

### Naive Bayes

#### Construction

#### Training

#### Evaluation

### Logistic Regression

#### Construction

In [15]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(
    random_state = 1,
    verbose = 1
)

#### Training

In [16]:
clf = lr.fit(
    x_train_tfidf,
    y_train.astype(int)
)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =       160879     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.10904D+04    |proj g|=  4.18000D+03

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
*****     46     54      1     0     0   1.924D-03   5.181D+03
  F =   5180.7261588282008     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.8s finished


#### Evaluation

In [17]:
y_pred = clf.predict(x_test_tfidf)

In [18]:
print(
    classification_report(
        y_test.astype(int),
        y_pred,
        target_names = ["Negative", "Positive"]
    )
)

              precision    recall  f1-score   support

    Negative       0.88      0.63      0.73       947
    Positive       0.89      0.97      0.93      3053

    accuracy                           0.89      4000
   macro avg       0.89      0.80      0.83      4000
weighted avg       0.89      0.89      0.89      4000



### K-Nearest Neighbors (KNN)

#### Construction

#### Training

#### Evaluation

### Support Vector Machine (SVM)

#### Construction

In [19]:
from sklearn import svm
svc = svm.SVC(
    random_state = 1,
    verbose = True
)

#### Training

In [20]:
clf = svc.fit(
    x_train_tfidf,
    y_train.astype(int)
)

[LibSVM]...........*....*
optimization finished, #iter = 15034
obj = -3590.095908, rho = 0.847704
nSV = 11209, nBSV = 2696
Total nSV = 11209


#### Evaluation

In [21]:
y_pred = clf.predict(x_test_tfidf)

In [22]:
print(
    classification_report(
        y_test.astype(int),
        y_pred,
        target_names = ["Negative", "Positive"]
    )
)

              precision    recall  f1-score   support

    Negative       0.88      0.67      0.76       947
    Positive       0.90      0.97      0.94      3053

    accuracy                           0.90      4000
   macro avg       0.89      0.82      0.85      4000
weighted avg       0.90      0.90      0.90      4000



### Decision Tree

#### Construction

#### Training

#### Evaluation

### Random Forest

#### Construction

#### Training

#### Evaluation

### Artificial Neural Network (ANN)

#### Construction

#### Training

#### Evaluation

## Report