In [41]:
import pandas as pd

In [42]:
df= pd.read_csv("demo.csv")

In [43]:
df.shape

(100000, 2)

In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   Feedback   100000 non-null  object
 1   Sentiment  100000 non-null  object
dtypes: object(2)
memory usage: 1.5+ MB


In [45]:
import matplotlib.pyplot as plt
import numpy as np
import matplotlib as mpl

In [46]:
punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''

def remove_puc(text):
    for ele in text:
        if ele in punc:
            text = text.replace(ele, "")
    return text

In [47]:
df["punctuation_removed"] = df["Feedback"].apply(lambda x: remove_puc(x))

In [48]:
df.head()

Unnamed: 0,Feedback,Sentiment,punctuation_removed
0,Of course Oliver Stone pulls out all the stops...,Positive,Of course Oliver Stone pulls out all the stops...
1,Bills Can Crusher,Positive,Bills Can Crusher
2,Product received with a chunk broken off of th...,Positive,Product received with a chunk broken off of th...
3,Don't waste your money buying these jars!!!! T...,Negative,Dont waste your money buying these jars The id...
4,This Game Rocks! Buy It I Got It Today And I L...,Positive,This Game Rocks Buy It I Got It Today And I Lo...


In [49]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [50]:
from nltk.tokenize import sent_tokenize

df["nltk_token"] = df["punctuation_removed"].apply(lambda x: word_tokenize(x.lower()))

In [51]:
df["StopWords_Removed"] = df['nltk_token'].apply(lambda x: [item for item in x if item not in stopwords.words('english')])

In [52]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
import xgboost, textblob, string
import numpy as np
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings("ignore")

In [53]:
classes = ['Negative', 'Positive']

In [54]:
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(df['punctuation_removed'], df['Sentiment'])
# Data is divided into 75% training and 25% testing by default
# label encode the target variable
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

In [66]:
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(df['punctuation_removed'])
# transform the training and validation data using count vectorizer object
xtrain_count = count_vect.transform(train_x)
xvalid_count = count_vect.transform(valid_x)

In [56]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}',max_features=5000)
tfidf_vect.fit(df['punctuation_removed'])
xtrain_tfidf = tfidf_vect.transform(train_x)
xvalid_tfidf = tfidf_vect.transform(valid_x)

In [73]:
def train_model(classifier, vector_train, label, vector_valid):
    classifier.fit(vector_train, label)
    predictions = classifier.predict(vector_valid)
    filename = 'tip.sav'
    pickle.dump(classifier, open(filename, 'wb'))
    return classification_report(predictions, valid_y ,target_names=classes)

In [74]:
# Linear Classifier on Count Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_count, train_y,xvalid_count)
print( "LR, Count Vectors: \n", accuracy)
print("------------------------------------------------")

LR, Count Vectors: 
               precision    recall  f1-score   support

    Negative       0.82      0.84      0.83     12242
    Positive       0.84      0.82      0.83     12758

    accuracy                           0.83     25000
   macro avg       0.83      0.83      0.83     25000
weighted avg       0.83      0.83      0.83     25000

------------------------------------------------


In [63]:
# Linear Classifier on Word Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf, train_y,xvalid_tfidf)
print( "LR, WordLevel TF-IDF: \n", accuracy)
print("------------------------------------------------")

LR, WordLevel TF-IDF: 
               precision    recall  f1-score   support

    Negative       0.84      0.83      0.83     12671
    Positive       0.83      0.84      0.83     12329

    accuracy                           0.83     25000
   macro avg       0.83      0.83      0.83     25000
weighted avg       0.83      0.83      0.83     25000

------------------------------------------------


In [60]:
import pickle

In [61]:
filename = 'finalized_model.sav'
pickle.dump(classifier, open(filename, 'wb'))

NameError: name 'classifier' is not defined

In [64]:
df.head()

Unnamed: 0,Feedback,Sentiment,punctuation_removed,nltk_token,StopWords_Removed
0,Of course Oliver Stone pulls out all the stops...,Positive,Of course Oliver Stone pulls out all the stops...,"[of, course, oliver, stone, pulls, out, all, t...","[course, oliver, stone, pulls, stops, wild, de..."
1,Bills Can Crusher,Positive,Bills Can Crusher,"[bills, can, crusher]","[bills, crusher]"
2,Product received with a chunk broken off of th...,Positive,Product received with a chunk broken off of th...,"[product, received, with, a, chunk, broken, of...","[product, received, chunk, broken, griddle, ed..."
3,Don't waste your money buying these jars!!!! T...,Negative,Dont waste your money buying these jars The id...,"[dont, waste, your, money, buying, these, jars...","[dont, waste, money, buying, jars, idea, great..."
4,This Game Rocks! Buy It I Got It Today And I L...,Positive,This Game Rocks Buy It I Got It Today And I Lo...,"[this, game, rocks, buy, it, i, got, it, today...","[game, rocks, buy, got, today, love, way, bett..."


In [67]:
abc=df[['punctuation_removed']]

In [68]:
abc.to_csv('abc.csv',index=False)

In [69]:
count_vect

CountVectorizer(token_pattern='\\w{1,}')

In [70]:
pickle.dump(count_vect.vocabulary_,open("feature.pkl","wb"))

In [72]:
import joblib

# example for saving python object as pkl
joblib.dump(count_vect, "vectorizer.pkl")

# loading pickled vectorizer
vectorizer = joblib.load("vectorizer.pkl")