In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk

from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize,sent_tokenize
import spacy
import re,string,unicodedata
import warnings
warnings.filterwarnings('ignore')
from IPython.display import display, HTML
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes, metrics
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
import fasttext
from sklearn.metrics import confusion_matrix

In [2]:
# nltk.download('punkt')   # one time execution
# nltk.download('stopwords')  # one time execution
#nltk.download('wordnet') # one time execution

In [3]:
df = pd.read_csv('Data.csv')
df.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [4]:
df.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [5]:
df.duplicated().any()
df = df.drop_duplicates()

In [6]:
df.describe()

Unnamed: 0,review,sentiment
count,49582,49582
unique,49582,2
top,Spoilers I guess.<br /><br /> The absolutely a...,positive
freq,1,24884


In [7]:
df.value_counts('sentiment')

sentiment
positive    24884
negative    24698
dtype: int64

In [8]:
def cleaninTxt(text):
    soup = BeautifulSoup(text, "html.parser")
    text = soup.get_text()
    
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = text.lower()
    return text

In [9]:
df['review']=df['review'].apply(cleaninTxt)

In [10]:
def stemTxt(text):
    lemmatizer = WordNetLemmatizer() 
    
    newtxt = ''
    for i in text.split():
        if i not in stopwords.words('english'):
            newtxt+=lemmatizer.lemmatize(i)+' '
    
    return newtxt

In [11]:
## Kelimelerin köklerini bulma ve stopwordleri temizleme örneği
a = df['review'].get(1)
print(a,'\n\n\n')

a = stemTxt(a)
print(a)

a wonderful little production  the filming technique is very unassuming  very old time bbc fashion and gives a comforting  and sometimes discomforting  sense of realism to the entire piece  the actors are extremely well chosen  michael sheen not only  has got all the polari  but he has all the voices down pat too  you can truly see the seamless editing guided by the references to williams  diary entries  not only is it well worth the watching but it is a terrificly written and performed piece  a masterful production about one of the great master s of comedy and his life  the realism really comes home with the little things  the fantasy of the guard which  rather than use the traditional  dream  techniques remains solid then disappears  it plays on our knowledge and our senses  particularly with the scenes concerning orton and halliwell and the sets  particularly of their flat with halliwell s murals decorating every surface  are terribly well done  



wonderful little production filmi

In [12]:
df['review']=df['review'].apply(stemTxt)


In [13]:
df.to_csv("processed.csv")

In [14]:
df.head(10)

Unnamed: 0,review,sentiment
0,one reviewer mentioned watching oz episode hoo...,positive
1,wonderful little production filming technique ...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically family little boy jake think zombie ...,negative
4,petter mattei love time money visually stunnin...,positive
5,probably time favorite movie story selflessnes...,positive
6,sure would like see resurrection dated seahunt...,positive
7,show amazing fresh innovative idea first aired...,negative
8,encouraged positive comment film looking forwa...,negative
9,like original gut wrenching laughter like movi...,positive


In [15]:
X_train, X_test, y_train, y_test = train_test_split(df["review"], df["sentiment"], test_size=0.2)

# Count Vectorizer

In [16]:
countVect = CountVectorizer(ngram_range=(1,2), max_features=20000)
countVect.fit(X_train)
trainVectct = countVect.transform(X_train)
testVectct = countVect.transform(X_test)

trainVectctDF = pd.DataFrame(trainVectct.toarray(),columns = countVect.get_feature_names())

In [17]:
trainVectctDF.head(10)

Unnamed: 0,aaron,ab,abandon,abandoned,abbey,abbot,abbott,abbott costello,abc,abducted,...,zombie,zombie film,zombie flick,zombie movie,zone,zoo,zoom,zorro,zucco,zucker
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# NAIVE BAYES - SVM - LOGISTIC REGRESSION + CountVect

In [18]:
nbct = naive_bayes.MultinomialNB()
nbct.fit(trainVectct,y_train)
pred = nbct.predict(testVectct)
print("Accuracy: ",metrics.accuracy_score(y_test, pred))
print("Confusion Matrix:\n",confusion_matrix(y_test, pred))

Accuracy:  0.864475143692649
Confusion Matrix:
 [[4224  705]
 [ 639 4349]]


In [19]:
svmct = LinearSVC()
svmct.fit(trainVectct,y_train)
pred = svmct.predict(testVectct)
print("Accuracy: ",metrics.accuracy_score(y_test, pred))
print("Confusion Matrix:\n",confusion_matrix(y_test, pred))

Accuracy:  0.8577190682666129
Confusion Matrix:
 [[4222  707]
 [ 704 4284]]


In [20]:
lrct = LogisticRegression()
lrct.fit(trainVectct,y_train)
pred = lrct.predict(testVectct)
print("Accuracy: ",metrics.accuracy_score(y_test, pred))
print("Confusion Matrix:\n",confusion_matrix(y_test, pred))

Accuracy:  0.8814157507310678
Confusion Matrix:
 [[4330  599]
 [ 577 4411]]


# TF-IDF Vectorizer

In [21]:
tfidfVect = TfidfVectorizer(ngram_range=(1,2), max_features=20000)
tfidfVect.fit(X_train)
trainVect = tfidfVect.transform(X_train)
testVect = tfidfVect.transform(X_test)

trainVectDF = pd.DataFrame(trainVect.toarray(),columns = tfidfVect.get_feature_names())

In [22]:
trainVectDF.head(10)

Unnamed: 0,aaron,ab,abandon,abandoned,abbey,abbot,abbott,abbott costello,abc,abducted,...,zombie,zombie film,zombie flick,zombie movie,zone,zoo,zoom,zorro,zucco,zucker
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.057094,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# NAIVE BAYES - SVM - LOGISTIC REGRESSION + TF-IDF

In [23]:
nb = naive_bayes.MultinomialNB()
nb.fit(trainVect,y_train)
pred = nb.predict(testVect)
print("Accuracy: ",metrics.accuracy_score(y_test, pred))
print("Confusion Matrix:\n",confusion_matrix(y_test, pred))

Accuracy:  0.873550468891802
Confusion Matrix:
 [[4222  707]
 [ 547 4441]]


In [24]:
svm = LinearSVC()
svm.fit(trainVect,y_train)
pred = svm.predict(testVect)
print("Accuracy: ",metrics.accuracy_score(y_test, pred))
print("Confusion Matrix:\n",confusion_matrix(y_test, pred))

Accuracy:  0.8914994453967934
Confusion Matrix:
 [[4358  571]
 [ 505 4483]]


In [25]:
lr = LogisticRegression()
lr.fit(trainVect,y_train)
pred = lr.predict(testVect)
print("Accuracy: ",metrics.accuracy_score(y_test, pred))
print("Confusion Matrix:\n",confusion_matrix(y_test, pred))

Accuracy:  0.8946253907431683
Confusion Matrix:
 [[4326  603]
 [ 442 4546]]


# FASTTEXT

In [26]:
#Train dosyasının hazırlanması
f = open("fttrain.txt", "w", encoding='utf-8')
for i,y in zip(X_train,y_train):
    f.write("__label__"+y+" "+i.replace('\n','')+"\n")
    

#Test dosyasının hazırlanması    
f = open("fttest.txt", "w", encoding='utf-8')
for i,y in zip(X_test,y_test):
    f.write("__label__"+y+" "+i.replace('\n','')+"\n")
    
f.close()

In [27]:
model = fasttext.train_supervised(input="fttrain.txt",epoch=25, lr=0.5, wordNgrams=2)

dftest = pd.DataFrame(X_test)
dftest["labels"] = y_test
dftest['labels'] = '__label__' + dftest['labels'].astype(str)

dftest["predicted"] = dftest["review"].apply(lambda x: model.predict(x)[0][0])
cfm =confusion_matrix(dftest["labels"], dftest["predicted"])
print("Confusion Matrix:\n",cfm,"\nAccuracy: ",(cfm.diagonal().sum())/cfm.sum().sum())


# model.test("fttest.txt")

Confusion Matrix:
 [[4387  542]
 [ 473 4515]] 
Accuracy:  0.897650499142886


In [28]:
## Using autotune
model = fasttext.train_supervised(input="fttrain.txt",autotuneValidationFile="fttest.txt", autotuneDuration=300)

dftest = pd.DataFrame(X_test)
dftest["labels"] = y_test
dftest['labels'] = '__label__' + dftest['labels'].astype(str)

dftest["predicted"] = dftest["review"].apply(lambda x: model.predict(x)[0][0])
cfm =confusion_matrix(dftest["labels"], dftest["predicted"])
print("Confusion Matrix:\n",cfm,"\nAccuracy: ",(cfm.diagonal().sum())/cfm.sum().sum())


# model.test("fttest.txt")

Confusion Matrix:
 [[4350  579]
 [ 464 4524]] 
Accuracy:  0.8948270646364828
