In [1]:
import pandas as pd

In [2]:
df= pd.read_csv("demo.csv")

In [3]:
df.shape

(100000, 2)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   Feedback   100000 non-null  object
 1   Sentiment  100000 non-null  object
dtypes: object(2)
memory usage: 1.5+ MB


In [5]:
import matplotlib.pyplot as plt
import numpy as np
import matplotlib as mpl

In [6]:
punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''

def remove_puc(text):
    for ele in text:
        if ele in punc:
            text = text.replace(ele, "")
    return text

In [7]:
df["punctuation_removed"] = df["Feedback"].apply(lambda x: remove_puc(x))

In [8]:
df.head()

Unnamed: 0,Feedback,Sentiment,punctuation_removed
0,Of course Oliver Stone pulls out all the stops...,Positive,Of course Oliver Stone pulls out all the stops...
1,Bills Can Crusher,Positive,Bills Can Crusher
2,Product received with a chunk broken off of th...,Positive,Product received with a chunk broken off of th...
3,Don't waste your money buying these jars!!!! T...,Negative,Dont waste your money buying these jars The id...
4,This Game Rocks! Buy It I Got It Today And I L...,Positive,This Game Rocks Buy It I Got It Today And I Lo...


In [9]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [11]:
from nltk.tokenize import sent_tokenize

df["nltk_token"] = df["punctuation_removed"].apply(lambda x: word_tokenize(x.lower()))

In [12]:
df["StopWords_Removed"] = df['nltk_token'].apply(lambda x: [item for item in x if item not in stopwords.words('english')])

In [13]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
import xgboost, textblob, string
import numpy as np
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings("ignore")

In [14]:
classes = ['Negative', 'Positive']

In [15]:
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(df['punctuation_removed'], df['Sentiment'])
# Data is divided into 75% training and 25% testing by default
# label encode the target variable
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

In [16]:
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(df['punctuation_removed'])
# transform the training and validation data using count vectorizer object
xtrain_count = count_vect.transform(train_x)
xvalid_count = count_vect.transform(valid_x)

In [17]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}',max_features=5000)
tfidf_vect.fit(df['punctuation_removed'])
xtrain_tfidf = tfidf_vect.transform(train_x)
xvalid_tfidf = tfidf_vect.transform(valid_x)

In [18]:
# ngram level tf-idf
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}',ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(df['punctuation_removed'])
xtrain_tfidf_ngram = tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram = tfidf_vect_ngram.transform(valid_x)

In [19]:
# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char',token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(df['punctuation_removed'])
xtrain_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(train_x)
xvalid_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(valid_x)

In [33]:
def train_model(classifier, vector_train, label, vector_valid):
    classifier.fit(vector_train, label)
    predictions = classifier.predict(vector_valid)
    filename = 'fuckkkk.sav'
    pickle.dump(classifier, open(filename, 'wb'))
    return classification_report(predictions, valid_y ,target_names=classes)

In [21]:
# Naive Bayes on Count Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y,xvalid_count)
print ("NB, Count Vectors: \n", accuracy)
print("------------------------------------------------")

NB, Count Vectors: 
               precision    recall  f1-score   support

    Negative       0.82      0.80      0.81     12936
    Positive       0.79      0.81      0.80     12064

    accuracy                           0.81     25000
   macro avg       0.81      0.81      0.81     25000
weighted avg       0.81      0.81      0.81     25000

------------------------------------------------


In [22]:
# Naive Bayes on Word Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y,xvalid_tfidf)
print ("NB, WordLevel TF-IDF: \n", accuracy)
print("------------------------------------------------")

NB, WordLevel TF-IDF: 
               precision    recall  f1-score   support

    Negative       0.82      0.81      0.81     12734
    Positive       0.80      0.81      0.81     12266

    accuracy                           0.81     25000
   macro avg       0.81      0.81      0.81     25000
weighted avg       0.81      0.81      0.81     25000

------------------------------------------------


In [23]:
# Naive Bayes on Ngram Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, train_y,xvalid_tfidf_ngram)
print ("NB, N-Gram Vectors: \n", accuracy)
print("------------------------------------------------")

NB, N-Gram Vectors: 
               precision    recall  f1-score   support

    Negative       0.85      0.69      0.76     15352
    Positive       0.62      0.80      0.70      9648

    accuracy                           0.73     25000
   macro avg       0.73      0.75      0.73     25000
weighted avg       0.76      0.73      0.74     25000

------------------------------------------------


In [24]:
# Naive Bayes on Character Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram_chars,train_y, xvalid_tfidf_ngram_chars)
print ("NB, CharLevel Vectors: \n", accuracy)
print("------------------------------------------------")

NB, CharLevel Vectors: 
               precision    recall  f1-score   support

    Negative       0.76      0.78      0.77     12318
    Positive       0.78      0.76      0.77     12682

    accuracy                           0.77     25000
   macro avg       0.77      0.77      0.77     25000
weighted avg       0.77      0.77      0.77     25000

------------------------------------------------


In [25]:
# Linear Classifier on Count Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_count, train_y,xvalid_count)
print( "LR, Count Vectors: \n", accuracy)
print("------------------------------------------------")

LR, Count Vectors: 
               precision    recall  f1-score   support

    Negative       0.81      0.84      0.82     12072
    Positive       0.84      0.81      0.83     12928

    accuracy                           0.82     25000
   macro avg       0.82      0.83      0.82     25000
weighted avg       0.83      0.82      0.82     25000

------------------------------------------------


In [26]:
# Linear Classifier on Word Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf, train_y,xvalid_tfidf)
print( "LR, WordLevel TF-IDF: \n", accuracy)
print("------------------------------------------------")

LR, WordLevel TF-IDF: 
               precision    recall  f1-score   support

    Negative       0.83      0.83      0.83     12564
    Positive       0.83      0.83      0.83     12436

    accuracy                           0.83     25000
   macro avg       0.83      0.83      0.83     25000
weighted avg       0.83      0.83      0.83     25000

------------------------------------------------


In [27]:
# Linear Classifier on Ngram Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram,train_y, xvalid_tfidf_ngram)
print( "LR, N-Gram Vectors: \n", accuracy)
print("------------------------------------------------")

LR, N-Gram Vectors: 
               precision    recall  f1-score   support

    Negative       0.62      0.83      0.71      9409
    Positive       0.87      0.70      0.78     15591

    accuracy                           0.75     25000
   macro avg       0.75      0.76      0.74     25000
weighted avg       0.78      0.75      0.75     25000

------------------------------------------------


In [28]:
# Linear Classifier on Character Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(),xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print( "LR, CharLevel Vectors: \n", accuracy)
print("------------------------------------------------")

LR, CharLevel Vectors: 
               precision    recall  f1-score   support

    Negative       0.81      0.82      0.81     12406
    Positive       0.82      0.81      0.81     12594

    accuracy                           0.81     25000
   macro avg       0.81      0.81      0.81     25000
weighted avg       0.81      0.81      0.81     25000

------------------------------------------------


In [29]:
# SVM on Count Vectors
accuracy = train_model(svm.SVC(), xtrain_count, train_y, xvalid_count)
print( "SVM, Count Vectors: \n", accuracy)
print("------------------------------------------------")

SVM, Count Vectors: 
               precision    recall  f1-score   support

    Negative       0.80      0.84      0.82     11897
    Positive       0.85      0.80      0.83     13103

    accuracy                           0.82     25000
   macro avg       0.82      0.82      0.82     25000
weighted avg       0.82      0.82      0.82     25000

------------------------------------------------


In [34]:
# SVM on Word Level TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf, train_y, xvalid_tfidf)
print( "SVM, WordLevel TF-IDF: \n", accuracy)
print("------------------------------------------------")

NameError: name 'pickle' is not defined

In [31]:
# SVM on Ngram Level TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf_ngram, train_y,xvalid_tfidf_ngram)
print( "SVM, N-Gram Vectors: \n", accuracy)
print("------------------------------------------------")

KeyboardInterrupt: 

In [None]:
# SVM on Character Level TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf_ngram_chars, train_y,xvalid_tfidf_ngram_chars)
print( "SVM, CharLevel Vectors: \n", accuracy)
print("------------------------------------------------")

In [None]:
# RF on Count Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_count, train_y,xvalid_count)
print( "RF, Count Vectors: \n", accuracy)
print("------------------------------------------------")

In [None]:
# RF on Word Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf, train_y,xvalid_tfidf)
print( "RF, WordLevel TF-IDF: \n", accuracy)
print("------------------------------------------------")

In [None]:
# RF on Ngram Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf_ngram,train_y, xvalid_tfidf_ngram)
print( "RF, N-Gram Vectors: \n", accuracy)
print("------------------------------------------------")

In [None]:
# RF on Character Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(),xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print( "RF, CharLevel Vectors: \n", accuracy)
print("------------------------------------------------")

In [None]:
# Extreme Gradient Boosting on Count Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_count.tocsc(), train_y,xvalid_count.tocsc())
print ("Xgb, Count Vectors: \n", accuracy)
print("------------------------------------------------")

In [None]:
# Extreme Gradient Boosting on Word Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf.tocsc(), train_y,xvalid_tfidf.tocsc())
print ("Xgb, WordLevel TF-IDF: \n", accuracy)
print("------------------------------------------------")

In [None]:
# Extreme Gradient Boosting on Ngram Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf_ngram.tocsc(),train_y, xvalid_tfidf_ngram)
print( "Xgb, N-Gram Vectors: \n", accuracy)
print("------------------------------------------------")

In [None]:
# Extreme Gradient Boosting on Character Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf_ngram_chars.tocsc(), train_y, xvalid_tfidf_ngram_chars.tocsc())
print ("Xgb, CharLevel Vectors: \n", accuracy)
print("------------------------------------------------")