In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk

from sklearn.model_selection import cross_val_score
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize,sent_tokenize
import spacy
import re,string,unicodedata
import warnings
from sklearn.svm import SVC
warnings.filterwarnings('ignore')
from IPython.display import display, HTML
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes, metrics
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
import fasttext
from sklearn.metrics import confusion_matrix

In [2]:
# nltk.download('punkt')   # one time execution
# nltk.download('stopwords')  # one time execution
# nltk.download('wordnet') # one time execution

# ÖNİŞLEME

In [3]:
df = pd.read_csv('Data.csv')
df.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [4]:
df.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [5]:
df.value_counts('sentiment')

sentiment
positive    25000
negative    25000
dtype: int64

In [6]:
def cleaninTxt(text):
    soup = BeautifulSoup(text, "html.parser")
    text = soup.get_text()
    
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = text.lower()
    return text

In [7]:
df['review']=df['review'].apply(cleaninTxt)

In [8]:
lemmatizer = WordNetLemmatizer() 
def stemTxt(text):
    global lemmatizer
    
    newtxt = ''
    for i in text.split():
        if i not in stopwords.words('english'):
            newtxt+=lemmatizer.lemmatize(i)+' '
    
    return newtxt

In [9]:
## Kelimelerin köklerini bulma ve stopwordleri temizleme örneği
a = df['review'].get(1)
print(a,'\n\n\n')

a = stemTxt(a)
print(a)

a wonderful little production  the filming technique is very unassuming  very old time bbc fashion and gives a comforting  and sometimes discomforting  sense of realism to the entire piece  the actors are extremely well chosen  michael sheen not only  has got all the polari  but he has all the voices down pat too  you can truly see the seamless editing guided by the references to williams  diary entries  not only is it well worth the watching but it is a terrificly written and performed piece  a masterful production about one of the great master s of comedy and his life  the realism really comes home with the little things  the fantasy of the guard which  rather than use the traditional  dream  techniques remains solid then disappears  it plays on our knowledge and our senses  particularly with the scenes concerning orton and halliwell and the sets  particularly of their flat with halliwell s murals decorating every surface  are terribly well done  



wonderful little production filmi

In [10]:
df['review']=df['review'].apply(stemTxt)


In [11]:
df.to_csv("processed.csv")

In [12]:
df.head(10)

Unnamed: 0,review,sentiment
0,one reviewer mentioned watching oz episode hoo...,positive
1,wonderful little production filming technique ...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically family little boy jake think zombie ...,negative
4,petter mattei love time money visually stunnin...,positive
5,probably time favorite movie story selflessnes...,positive
6,sure would like see resurrection dated seahunt...,positive
7,show amazing fresh innovative idea first aired...,negative
8,encouraged positive comment film looking forwa...,negative
9,like original gut wrenching laughter like movi...,positive


In [3]:
df = pd.read_csv("processed.csv") # sonraki kullanımlar için 

In [4]:
X_train, X_test, y_train, y_test = train_test_split(df["review"], df["sentiment"], test_size=0.2) # FastText için

# Count Vectorizer

In [5]:
countVect = CountVectorizer(ngram_range=(1,2))
# countVect.fit(X_train)
# trainVectct = countVect.transform(X_train)
# testVectct = countVect.transform(X_test)

cvAll = countVect.fit_transform(df["review"].values)

# NAIVE BAYES - SVM - LOGISTIC REGRESSION + CountVect

In [6]:
nbct = naive_bayes.MultinomialNB()
# nbct.fit(trainVectct,y_train)
# pred = nbct.predict(testVectct)
# print("Accuracy: ",metrics.accuracy_score(y_test, pred))
# print("Confusion Matrix:\n",confusion_matrix(y_test, pred))

scores = cross_val_score(nbct, cvAll , df["sentiment"].values, cv=5)
print("Scores: ",scores,"\nAccuracy: ",scores.mean())

Scores:  [0.8841 0.8893 0.8814 0.8802 0.8864] 
Accuracy:  0.8842799999999998


In [7]:
svmct = LinearSVC()
# svmct.fit(trainVectct,y_train)
# pred = svmct.predict(testVectct)
# print("Accuracy: ",metrics.accuracy_score(y_test, pred))
# print("Confusion Matrix:\n",confusion_matrix(y_test, pred))

scores = cross_val_score(svmct, cvAll , df["sentiment"].values, cv=5)
print("Scores: ",scores,"\nAccuracy: ",scores.mean())

Scores:  [0.9015 0.8962 0.8962 0.8964 0.9001] 
Accuracy:  0.8980799999999999


In [8]:
cVals = [0.001,0.01,0.1,10,100,1000]

for i in cVals:
    svmct = LinearSVC(C = i)
    scores = cross_val_score(svmct, cvAll , df["sentiment"].values, cv=5)
    print("C value: ",i," Scores: ",scores,"\nAccuracy: ",scores.mean(),"\n")

C value:  0.001  Scores:  [0.8988 0.8942 0.8892 0.8952 0.8935] 
Accuracy:  0.8941800000000001 

C value:  0.01  Scores:  [0.9074 0.8986 0.8989 0.8985 0.9026] 
Accuracy:  0.9012 

C value:  0.1  Scores:  [0.9028 0.8958 0.8968 0.8973 0.9008] 
Accuracy:  0.8987 

C value:  10  Scores:  [0.9012 0.896  0.8965 0.8957 0.8998] 
Accuracy:  0.8978400000000001 

C value:  100  Scores:  [0.9012 0.8961 0.8966 0.8957 0.8999] 
Accuracy:  0.8978999999999999 

C value:  1000  Scores:  [0.9012 0.8961 0.8966 0.8957 0.8999] 
Accuracy:  0.8978999999999999 



In [9]:
lrct = LogisticRegression()
# lrct.fit(trainVectct,y_train)
# pred = lrct.predict(testVectct)
# print("Accuracy: ",metrics.accuracy_score(y_test, pred))
# print("Confusion Matrix:\n",confusion_matrix(y_test, pred))

scores = cross_val_score(lrct, cvAll , df["sentiment"].values, cv=5)
print("Scores: ",scores,"\nAccuracy: ",scores.mean())

Scores:  [0.9051 0.8986 0.8992 0.8987 0.9017] 
Accuracy:  0.90066


# TF-IDF Vectorizer

In [10]:
tfidfVect = TfidfVectorizer(ngram_range=(1,2))
# tfidfVect.fit(X_train)
# trainVect = tfidfVect.transform(X_train)
# testVect = tfidfVect.transform(X_test)

tfAll = tfidfVect.fit_transform(df["review"].values)

# NAIVE BAYES - SVM - LOGISTIC REGRESSION + TF-IDF

In [11]:
nb = naive_bayes.MultinomialNB()
# nb.fit(trainVect,y_train)
# pred = nb.predict(testVect)
# print("Accuracy: ",metrics.accuracy_score(y_test, pred))
# print("Confusion Matrix:\n",confusion_matrix(y_test, pred))

scores = cross_val_score(nb, tfAll , df["sentiment"].values, cv=5)
print("Scores: ",scores,"\nAccuracy: ",scores.mean())

Scores:  [0.8848 0.8931 0.8865 0.8839 0.8897] 
Accuracy:  0.8876000000000002


In [12]:
svm = LinearSVC()
# svm.fit(trainVect,y_train)
# pred = svm.predict(testVect)
# print("Accuracy: ",metrics.accuracy_score(y_test, pred))
# print("Confusion Matrix:\n",confusion_matrix(y_test, pred))

scores = cross_val_score(svm, tfAll , df["sentiment"].values, cv=5)
print("Scores: ",scores,"\nAccuracy: ",scores.mean())

Scores:  [0.91   0.9071 0.9052 0.906  0.9088] 
Accuracy:  0.9074199999999999


In [13]:
for i in cVals:
    svm = LinearSVC(C = i)
    scores = cross_val_score(svm, tfAll , df["sentiment"].values, cv=5)
    print("C value: ",i," Scores: ",scores,"\nAccuracy: ",scores.mean(),"\n")

C value:  0.001  Scores:  [0.8241 0.8267 0.8247 0.8168 0.8269] 
Accuracy:  0.82384 

C value:  0.01  Scores:  [0.8545 0.8578 0.8546 0.8488 0.8559] 
Accuracy:  0.8543200000000001 

C value:  0.1  Scores:  [0.8943 0.8951 0.8903 0.8921 0.8917] 
Accuracy:  0.8926999999999999 

C value:  10  Scores:  [0.9119 0.9087 0.9083 0.9076 0.9109] 
Accuracy:  0.90948 

C value:  100  Scores:  [0.9119 0.9085 0.909  0.908  0.9118] 
Accuracy:  0.90984 

C value:  1000  Scores:  [0.912  0.9085 0.9091 0.9081 0.9116] 
Accuracy:  0.9098600000000001 



In [14]:
lr = LogisticRegression()
# lr.fit(trainVect,y_train)
# pred = lr.predict(testVect)
# print("Accuracy: ",metrics.accuracy_score(y_test, pred))
# print("Confusion Matrix:\n",confusion_matrix(y_test, pred))

scores = cross_val_score(lr, tfAll , df["sentiment"].values, cv=5)
print("Scores: ",scores,"\nAccuracy: ",scores.mean())

Scores:  [0.8923 0.8938 0.8887 0.8913 0.8904] 
Accuracy:  0.8913


# FASTTEXT

In [23]:
#Train dosyasının hazırlanması
f = open("fttrain.txt", "w", encoding='utf-8')
for i,y in zip(X_train,y_train):
    f.write("__label__"+y+" "+i.replace('\n','')+"\n")
    

#Test dosyasının hazırlanması    
f = open("fttest.txt", "w", encoding='utf-8')
for i,y in zip(X_test,y_test):
    f.write("__label__"+y+" "+i.replace('\n','')+"\n")
    
f.close()

In [24]:
model = fasttext.train_supervised(input="fttrain.txt",autotuneValidationFile="fttest.txt", autotuneDuration=3600)

# dftest = pd.DataFrame(X_test)
# dftest["labels"] = y_test
# dftest['labels'] = '__label__' + dftest['labels'].astype(str)

# dftest["predicted"] = dftest["review"].apply(lambda x: model.predict(x)[0][0])
# cfm =confusion_matrix(dftest["labels"], dftest["predicted"])
# print("Confusion Matrix:\n",cfm,"\nAccuracy: ",(cfm.diagonal().sum())/cfm.sum().sum())

model.test("fttest.txt")

(10000, 0.9067, 0.9067)