# Sentiment Analysis

## Imports

In [None]:
import nltk
import string
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [None]:
# Fetch the tokenizer models and stop-word lists used by the preprocessing
# cells. NLTK >= 3.8.2 loads the Punkt tokenizer from the "punkt_tab" resource
# instead of the pickled "punkt" package, so both are requested to keep
# nltk.word_tokenize working on old and new NLTK releases.
nltk.download(
    ["punkt", "punkt_tab", "stopwords"]
)

## Loading Data

In [None]:
# Load the raw review dataset from its relative path into a DataFrame.
df = pd.read_csv(filepath_or_buffer="data/raw.csv")

In [None]:
# Preview the first five rows of the dataset.
df.head(n=5)

In [None]:
# Bar chart of how many reviews fall into each sentiment class.
plt.figure(figsize=(7, 5))
df["Sentiment"].value_counts().plot(kind="bar")
plt.title("Sentiment")
plt.xlabel("")
plt.ylabel("Count")
# Keep the class names horizontal instead of the default rotated tick labels.
plt.xticks(rotation=0)
plt.show()

In [None]:
# Histogram of review lengths, measured in characters.
plt.figure(figsize=(7, 5))
# The vectorized .str.len() yields NaN for a missing review instead of raising
# a TypeError the way len() does on a float NaN; hist() ignores NaN values.
df["length"] = df["reviewText"].str.len()
df["length"].hist(bins=50)
plt.title("Histogram of Review Lengths")
plt.xlabel("Review Length")
plt.ylabel("Frequency")
plt.show()

## Data Preprocessing

### Label Encoding

In [None]:
# Replace the textual sentiment labels in place with integer class codes
# ("Positive" -> 1, "Negative" -> 0); any other value is left untouched.
for sentiment_label, sentiment_code in (("Positive", 1), ("Negative", 0)):
    df.loc[df["Sentiment"] == sentiment_label, "Sentiment"] = sentiment_code

In [None]:
# Show ten randomly chosen rows; the fixed seed makes the selection repeatable.
df.sample(n=10, random_state=1)

### Splitting Data

In [None]:
# Features are the review texts; targets are the values of the Sentiment column.
x, y = df["reviewText"], df["Sentiment"]

# Hold out 20% of the rows for evaluation. Rows are shuffled before splitting,
# with a fixed seed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=1,
)

### Sentence Normalization

In [None]:
# Snowball stemmer configured for English.
stemmer = nltk.stem.snowball.SnowballStemmer("english")
# English stop words, kept as a set: they are only used for membership tests,
# which a set answers in constant time instead of scanning a list per token.
stopwords = set(nltk.corpus.stopwords.words("english"))

In [None]:
def normalize(sentence):
    """Return *sentence* as a space-joined string of stemmed tokens.

    The sentence is tokenized with ``nltk.word_tokenize``. Stop words and
    tokens made up solely of punctuation and/or digit characters are dropped;
    the remaining tokens are stemmed with the module-level ``stemmer``.

    Stop words are matched case-insensitively: the NLTK English stop-word
    list is lower-case, so a case-sensitive check would let capitalized stop
    words such as "The" or "I" slip through.

    Args:
        sentence: Text to normalize.

    Returns:
        The surviving stemmed tokens joined by single spaces (an empty string
        when no token survives).
    """
    # Built once per call rather than once per token.
    ignorable_chars = set(string.punctuation + string.digits)
    return " ".join(
        stemmer.stem(token)
        for token in nltk.word_tokenize(sentence)
        if token.lower() not in stopwords
        and not ignorable_chars.issuperset(token)
    )

In [None]:
# Normalize every training review (tokenize, filter, stem) into a plain list.
x_train_cleaned = list(map(normalize, x_train))

In [None]:
# Apply the same normalization to every held-out review.
x_test_cleaned = list(map(normalize, x_test))

### Sentence Vectorization

#### TF-IDF Vectorization

In [None]:
# Word-level TF-IDF features over unigrams and bigrams, with accents stripped.
# The vocabulary and IDF weights are fitted on the training texts only.
# NOTE: the name `vectorizer` is rebound by the Count Vectorization section.
vectorizer = TfidfVectorizer(strip_accents="unicode", analyzer="word", ngram_range=(1, 2))
x_train_tfidf = vectorizer.fit_transform(x_train_cleaned)

In [None]:
# Project the held-out texts onto the vocabulary fitted on the training set.
x_test_tfidf = vectorizer.transform(raw_documents=x_test_cleaned)

In [None]:
# Display the sparse TF-IDF training matrix as a sparse-backed DataFrame.
pd.DataFrame.sparse.from_spmatrix(data=x_train_tfidf)

#### Count Vectorization

In [None]:
# Word-level term counts over unigrams and bigrams, with accents stripped.
# The vocabulary is fitted on the training texts only.
# NOTE: this rebinds `vectorizer`, which previously held the TF-IDF vectorizer.
vectorizer = CountVectorizer(strip_accents="unicode", analyzer="word", ngram_range=(1, 2))
x_train_count = vectorizer.fit_transform(x_train_cleaned)

In [None]:
# Count the training-set vocabulary terms in the held-out texts.
x_test_count = vectorizer.transform(raw_documents=x_test_cleaned)

In [None]:
# Display the sparse term-count training matrix as a sparse-backed DataFrame.
pd.DataFrame.sparse.from_spmatrix(data=x_train_count)

## Models

### Multinomial Naive Bayes

#### Construction

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes classifier with force_alpha enabled.
mnb = MultinomialNB(force_alpha=True)

#### Training

In [None]:
# Train on the term-count features; labels are cast to int before fitting.
clf = mnb.fit(X=x_train_count, y=y_train.astype(int))

#### Evaluation

In [None]:
# Predict with `mnb` itself rather than the shared `clf` name, which every
# model section rebinds: re-running this cell after a later section has been
# trained would otherwise score a different model. fit() returns the
# estimator, so the result is unchanged when cells run in order.
y_pred = mnb.predict(x_test_count)

In [None]:
# Per-class precision, recall and F1 of the predictions on the held-out split.
print(
    classification_report(
        y_true=y_test.astype(int),
        y_pred=y_pred,
        target_names=["Negative", "Positive"],
    )
)

### Logistic Regression

#### Construction

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression classifier with a fixed seed and verbose output enabled.
lr = LogisticRegression(random_state=1, verbose=1)

#### Training

In [None]:
# Train on the TF-IDF features; labels are cast to int before fitting.
clf = lr.fit(X=x_train_tfidf, y=y_train.astype(int))

#### Evaluation

In [None]:
# Predict with `lr` itself rather than the shared `clf` name, which every
# model section rebinds: re-running this cell out of order would otherwise
# score a different model. fit() returns the estimator, so the result is
# unchanged when cells run in order.
y_pred = lr.predict(x_test_tfidf)

In [None]:
# Per-class precision, recall and F1 of the predictions on the held-out split.
print(
    classification_report(
        y_true=y_test.astype(int),
        y_pred=y_pred,
        target_names=["Negative", "Positive"],
    )
)

### K-Nearest Neighbors (KNN)

#### Construction

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# K-nearest-neighbours classifier that votes over the 20 closest samples.
knn = KNeighborsClassifier(n_neighbors=20)

#### Training

In [None]:
# Fit on the TF-IDF features; labels are cast to int before fitting.
clf = knn.fit(X=x_train_tfidf, y=y_train.astype(int))

#### Evaluation

In [None]:
# Classify the held-out TF-IDF features with the fitted KNN model.
y_pred = knn.predict(X=x_test_tfidf)

In [None]:
# Per-class precision, recall and F1 of the predictions on the held-out split.
print(
    classification_report(
        y_true=y_test.astype(int),
        y_pred=y_pred,
        target_names=["Negative", "Positive"],
    )
)

### Support Vector Machine (SVM)

#### Construction

In [None]:
from sklearn import svm

# Support vector classifier with a fixed seed and verbose output enabled.
svc = svm.SVC(random_state=1, verbose=True)

#### Training

In [None]:
# Train on the TF-IDF features; labels are cast to int before fitting.
clf = svc.fit(X=x_train_tfidf, y=y_train.astype(int))

#### Evaluation

In [None]:
# Predict with `svc` itself rather than the shared `clf` name, which every
# model section rebinds: re-running this cell out of order would otherwise
# score a different model. fit() returns the estimator, so the result is
# unchanged when cells run in order.
y_pred = svc.predict(x_test_tfidf)

In [None]:
# Per-class precision, recall and F1 of the predictions on the held-out split.
print(
    classification_report(
        y_true=y_test.astype(int),
        y_pred=y_pred,
        target_names=["Negative", "Positive"],
    )
)

### Decision Tree

#### Construction

In [None]:
from sklearn import tree

# Decision tree classifier; the fixed seed makes the fitted tree reproducible.
dtc = tree.DecisionTreeClassifier(random_state=1)

#### Training

In [None]:
# Train on the TF-IDF features; labels are cast to int before fitting.
clf = dtc.fit(X=x_train_tfidf, y=y_train.astype(int))

#### Evaluation

In [None]:
# Predict with `dtc` itself rather than the shared `clf` name, which every
# model section rebinds: re-running this cell out of order would otherwise
# score a different model. fit() returns the estimator, so the result is
# unchanged when cells run in order.
y_pred = dtc.predict(x_test_tfidf)

In [None]:
# Per-class precision, recall and F1 of the predictions on the held-out split.
print(
    classification_report(
        y_true=y_test.astype(int),
        y_pred=y_pred,
        target_names=["Negative", "Positive"],
    )
)

### Random Forest

#### Construction

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest classifier with a fixed seed and verbose output enabled.
rfc = RandomForestClassifier(random_state=1, verbose=1)

#### Training

In [None]:
# Train on the TF-IDF features; labels are cast to int before fitting.
clf = rfc.fit(X=x_train_tfidf, y=y_train.astype(int))

#### Evaluation

In [None]:
# Classify the held-out TF-IDF features with the fitted random forest.
y_pred = rfc.predict(X=x_test_tfidf)

In [None]:
# Per-class precision, recall and F1 of the predictions on the held-out split.
print(
    classification_report(
        y_true=y_test.astype(int),
        y_pred=y_pred,
        target_names=["Negative", "Positive"],
    )
)

### Artificial Neural Network (ANN)

#### Construction

#### Training

#### Evaluation

## Report