# Setup

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import re
import scipy.sparse as sp
import contractions
from sklearn.feature_extraction.text import CountVectorizer

# Load in Data

In [119]:
df = pd.read_csv('train.csv', skiprows=0)
df2 = pd.read_csv('test.csv', skiprows=0)

# Clean Up Data

In [120]:
nltk.download('punkt')
from nltk.stem.snowball import SnowballStemmer
nltk.download('stopwords')
from nltk.corpus import stopwords

contractions = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "i'd": "I would",
    "i'll": "I will",
    "i'm": "I am",
    "i've": "I have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "its" : "it is",
    "let's": "let us",
    "might've": "might have",
    "must've": "must have",
    "mightn't": "might not",
    "mustn't": "must not",
    "needn't": "need not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where's": "where is",
    "who'd": "who would",
    "who'll": "who will",
    "who's": "who is",
    "who've": "who have",
    "won't": "will not",
    "would've": "would have",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have",
    "'s": "is",
    "'re": " are",
    "'ve": " have",
    "'m": " am",
    "'d": " would",
}

stop_words = set(stopwords.words('english'))
stem = SnowballStemmer('english')

def decontract(tkn):
    for word in tkn.split():
        if word.lower() in contractions:
            tkn = tkn.replace(word, contractions[word.lower()])
    return tkn

def tokenize(sentence):
    sent = nltk.sent_tokenize(sentence)
    sent = [word.lower() for word in sent]
    sent = [re.sub('[^a-zA-Z]', ' ', word) for word in sent]
    sent = [decontract(word) for word in sent]
    tokens = []
    for word in sent:
        words = nltk.word_tokenize(word)
        words = [w for w in words if not w in stop_words]
        words = [stem.stem(w) for w in words if not w in stop_words]
        tokens.extend(words)
    return tokens

df['Text'] = df['Text'].apply(tokenize)
df2['Text'] = df2['Text'].apply(tokenize)




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Linguistic Feature Extraction

In [128]:
arr = []
for wrd in df['Text']:
    arr.append(' '.join(wrd))
for wrd in df2['Text']:
    arr.append(' '.join(wrd))

# fit the CountVectorizer on the entire corpus of text
count_vectorizer = CountVectorizer(tokenizer=tokenize, max_df=0.5, min_df=2)
bag = count_vectorizer.fit_transform(arr)

# transform each dataset separately
bag_df = count_vectorizer.transform([' '.join(wrd) for wrd in df['Text']])
bag_df2 = count_vectorizer.transform([' '.join(wrd) for wrd in df2['Text']])

# Classification Model

In [129]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [136]:
testY = df2['Sentiment']
mod = LogisticRegression(max_iter = 1100000)
trainY = df['Sentiment']
mod.fit(bag_df,trainY)
predicted = mod.predict(bag_df2)

In [137]:
print(classification_report(predicted, testY))

              precision    recall  f1-score   support

           0       0.91      0.66      0.76       245
           1       0.54      0.86      0.66       114

    accuracy                           0.72       359
   macro avg       0.72      0.76      0.71       359
weighted avg       0.79      0.72      0.73       359

