# Metin Sınıflandırma Modeli

In [1]:
import nltk
import string
import xgboost
import textblob
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from textblob import TextBlob, Word

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, decomposition, ensemble
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

from warnings import filterwarnings
filterwarnings("ignore")

In [2]:
df = pd.read_csv("train.tsv", sep = "\t")
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [3]:
df["Sentiment"].replace(0, value = "negatif", inplace = True)
df["Sentiment"].replace(1, value = "negatif", inplace = True)
df["Sentiment"].replace(3, value = "pozitif", inplace = True)
df["Sentiment"].replace(4, value = "pozitif", inplace = True)
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,negatif
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [4]:
df = df[(df["Sentiment"] == "negatif") | (df["Sentiment"] == "pozitif")]
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,negatif
21,22,1,good for the goose,pozitif
22,23,1,good,pozitif
33,34,1,"the gander , some of which occasionally amuses...",negatif
46,47,1,amuses,pozitif


In [5]:
df.groupby("Sentiment").count()

Unnamed: 0_level_0,PhraseId,SentenceId,Phrase
Sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
negatif,34345,34345,34345
pozitif,42133,42133,42133


In [6]:
df = df[["Phrase","Sentiment"]]
df.columns = ["Text","Label"]
df.head()

Unnamed: 0,Text,Label
0,A series of escapades demonstrating the adage ...,negatif
21,good for the goose,pozitif
22,good,pozitif
33,"the gander , some of which occasionally amuses...",negatif
46,amuses,pozitif


In [7]:
df.reset_index(inplace = True)
df.drop("index", axis = 1, inplace = True)
df.head()

Unnamed: 0,Text,Label
0,A series of escapades demonstrating the adage ...,negatif
1,good for the goose,pozitif
2,good,pozitif
3,"the gander , some of which occasionally amuses...",negatif
4,amuses,pozitif


## 1. Metin Ön İşleme

In [8]:
df["Text"] = df["Text"].apply(lambda x: " ".join(x.lower() for x in x.split()))
df["Text"] = df["Text"].str.replace("[^\w\s]","")
df["Text"] = df["Text"].str.replace("\d","")

sw = stopwords.words("english")
df["Text"] = df["Text"].apply(lambda x: " ".join(x for x in x.split() if x not in sw))

sil = pd.Series(" ".join(df["Text"]).split()).value_counts()[-1000:]
df["Text"] = df["Text"].apply(lambda x: " ".join(x for x in x.split() if x not in sil))
df["Text"] = df["Text"].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
df.head()

Unnamed: 0,Text,Label
0,series demonstrating adage good goose also goo...,negatif
1,good goose,pozitif
2,good,pozitif
3,occasionally amuses none amount much story,negatif
4,amuses,pozitif


## 2. Değişken Mühendisliği

### A. Train & Test

In [9]:
train_x, test_x, train_y, test_y = model_selection.train_test_split(df["Text"], df["Label"])

encoder = preprocessing.LabelEncoder()

train_y = encoder.fit_transform(train_y)
test_y = encoder.fit_transform(test_y)

print(train_y[0:5])
print(test_y[0:5])

[1 1 0 0 1]
[1 0 1 1 1]


### B. Count Vectors

In [10]:
vectorizer = CountVectorizer().fit(train_x)

x_train_count = vectorizer.transform(train_x)
x_test_count = vectorizer.transform(test_x)

vectorizer.get_feature_names()[0:5]

['aaa', 'aaliyah', 'abagnale', 'abandon', 'abandoned']

In [11]:
x_train_count.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

### C. TD-IDF

In [12]:
tf_idf_word_vectorizer = TfidfVectorizer().fit(train_x)

x_train_tf_idf_word = tf_idf_word_vectorizer.transform(train_x)
x_test_tf_idf_word = tf_idf_word_vectorizer.transform(test_x)

tf_idf_word_vectorizer.get_feature_names()[0:5]

['aaa', 'aaliyah', 'abagnale', 'abandon', 'abandoned']

In [13]:
x_train_tf_idf_word.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

### D. N-GRAM Level TF-IDF

In [14]:
tf_idf_ngram_vectorizer = TfidfVectorizer(ngram_range = (2,3)).fit(train_x)

x_train_tf_idf_ngram = tf_idf_ngram_vectorizer.transform(train_x)
x_test_tf_idf_ngram = tf_idf_ngram_vectorizer.transform(test_x)

tf_idf_ngram_vectorizer.get_feature_names()[0:5]

['aaa action',
 'aaa action xxx',
 'aaliyah one',
 'aaliyah one starring',
 'abagnale antic']

### E. Characters Level TF-IDF

In [15]:
tf_idf_chars_vectorizer = TfidfVectorizer(analyzer = "char", ngram_range = (2,3)).fit(train_x)

x_train_tf_idf_chars = tf_idf_chars_vectorizer.transform(train_x)
x_test_tf_idf_chars = tf_idf_chars_vectorizer.transform(test_x)

tf_idf_chars_vectorizer.get_feature_names()[0:5]

[' a', ' aa', ' ab', ' ac', ' ad']

## 3. Makine Öğrenmesi ile Sentiment Sınıflandırması

### A. Lojistik Regresyon

In [16]:
loj = linear_model.LogisticRegression()
loj_model = loj.fit(x_train_count, train_y)
accuracy = model_selection.cross_val_score(loj_model, x_test_count, test_y, cv = 10, n_jobs = -1, verbose = 2).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    2.7s remaining:    2.7s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    2.7s finished


In [17]:
print("Count Vectors Doğruluk Oranı:", accuracy)

Count Vectors Doğruluk Oranı: 0.836244769874477


In [18]:
loj = linear_model.LogisticRegression()
loj_model = loj.fit(x_train_tf_idf_word, train_y)
accuracy = model_selection.cross_val_score(loj_model, x_test_tf_idf_word, test_y, cv = 10, n_jobs = -1, verbose = 2).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    0.5s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    1.4s finished


In [19]:
print("Word-Level TD-IDF Doğruluk Oranı:", accuracy)

Word-Level TD-IDF Doğruluk Oranı: 0.8309100418410041


In [20]:
loj = linear_model.LogisticRegression()
loj_model = loj.fit(x_train_tf_idf_ngram, train_y)
accuracy = model_selection.cross_val_score(loj_model, x_test_tf_idf_ngram, test_y, cv = 10, n_jobs = -1, verbose = 2).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    2.0s remaining:    2.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    2.2s finished


In [21]:
print("N-GRAM TF-IDF Doğruluk Oranı:", accuracy)

N-GRAM TF-IDF Doğruluk Oranı: 0.7505230125523012


In [22]:
loj = linear_model.LogisticRegression()
loj_model = loj.fit(x_train_tf_idf_chars, train_y)
accuracy = model_selection.cross_val_score(loj_model, x_test_tf_idf_chars, test_y, cv = 10, n_jobs = -1, verbose = 2).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    1.2s remaining:    1.2s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    1.2s finished


In [23]:
print("CHARLEVEL TF-IDF Doğruluk Oranı:", accuracy)

CHARLEVEL TF-IDF Doğruluk Oranı: 0.775836820083682


### B. Naive Bayes

In [24]:
nb = naive_bayes.MultinomialNB()
nb_model = nb.fit(x_train_count, train_y)
accuracy = model_selection.cross_val_score(nb_model, x_test_count, test_y, cv = 10, n_jobs = -1, verbose = 2).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished


In [25]:
print("Count Vectors Doğruluk Oranı:", accuracy)

Count Vectors Doğruluk Oranı: 0.8318514644351463


In [26]:
nb = naive_bayes.MultinomialNB()
nb_model = nb.fit(x_train_tf_idf_word, train_y)
accuracy = model_selection.cross_val_score(nb_model, x_test_tf_idf_word, test_y, cv = 10, n_jobs = -1, verbose = 2).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished


In [27]:
print("Word-Level TD-IDF Doğruluk Oranı:", accuracy)

Word-Level TD-IDF Doğruluk Oranı: 0.8300732217573221


In [28]:
nb = naive_bayes.MultinomialNB()
nb_model = nb.fit(x_train_tf_idf_ngram, train_y)
accuracy = model_selection.cross_val_score(nb_model, x_test_tf_idf_ngram, test_y, cv = 10, n_jobs = -1, verbose = 2).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished


In [29]:
print("N-GRAM TF-IDF Doğruluk Oranı:", accuracy)

N-GRAM TF-IDF Doğruluk Oranı: 0.773744769874477


In [30]:
nb = naive_bayes.MultinomialNB()
nb_model = nb.fit(x_train_tf_idf_chars, train_y)
accuracy = model_selection.cross_val_score(nb_model, x_test_tf_idf_chars, test_y, cv = 10, n_jobs = -1, verbose = 2).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished


In [31]:
print("CHARLEVEL TF-IDF Doğruluk Oranı:", accuracy)

CHARLEVEL TF-IDF Doğruluk Oranı: 0.7486924686192469


### C. Random Forests

In [32]:
rf = ensemble.RandomForestClassifier()
rf_model = rf.fit(x_train_count, train_y)
accuracy = model_selection.cross_val_score(rf_model, x_test_count, test_y, cv = 10, n_jobs = -1, verbose = 2).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:  1.2min remaining:  1.2min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.2min finished


In [33]:
print("Count Vectors Doğruluk Oranı:", accuracy)

Count Vectors Doğruluk Oranı: 0.828347280334728


In [34]:
rf = ensemble.RandomForestClassifier()
rf_model = rf.fit(x_train_count, train_y)
accuracy = model_selection.cross_val_score(rf_model, x_test_tf_idf_word, test_y, cv = 10, n_jobs = -1, verbose = 2).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:  1.1min remaining:  1.1min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.1min finished


In [35]:
print("Word-Level TD-IDF Doğruluk Oranı:", accuracy)

Word-Level TD-IDF Doğruluk Oranı: 0.8272489539748953


In [36]:
rf = ensemble.RandomForestClassifier()
rf_model = rf.fit(x_train_count, train_y)
accuracy = model_selection.cross_val_score(rf_model, x_test_tf_idf_ngram, test_y, cv = 10, n_jobs = -1, verbose = 2).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:  9.5min remaining:  9.5min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  9.5min finished


In [37]:
print("N-GRAM TF-IDF Doğruluk Oranı:", accuracy)

N-GRAM TF-IDF Doğruluk Oranı: 0.7563284518828451


In [38]:
rf = ensemble.RandomForestClassifier()
rf_model = rf.fit(x_train_count, train_y)
accuracy = model_selection.cross_val_score(rf_model, x_test_tf_idf_chars, test_y, cv = 10, n_jobs = -1, verbose = 2).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   54.6s remaining:   54.6s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   55.1s finished


In [39]:
print("CHARLEVEL TF-IDF Doğruluk Oranı:", accuracy)

CHARLEVEL TF-IDF Doğruluk Oranı: 0.8082112970711298


### D. XGBoost

In [40]:
xgb = xgboost.XGBClassifier()
xgb_model = xgb.fit(x_train_count, train_y)
accuracy = model_selection.cross_val_score(xgb_model, x_test_count, test_y, cv = 10, n_jobs = -1, verbose = 2).mean()



[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    8.5s remaining:    8.5s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    8.7s finished


In [41]:
print("Count Vectors Doğruluk Oranı:", accuracy)

Count Vectors Doğruluk Oranı: 0.7153765690376569


In [42]:
xgb = xgboost.XGBClassifier()
xgb_model = xgb.fit(x_train_count, train_y)
accuracy = model_selection.cross_val_score(xgb_model, x_test_tf_idf_word, test_y, cv = 10, n_jobs = -1, verbose = 2).mean()



[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    8.1s remaining:    8.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    8.8s finished


In [43]:
print("Word-Level TD-IDF Doğruluk Oranı:", accuracy)

Word-Level TD-IDF Doğruluk Oranı: 0.7123953974895397


In [44]:
xgb = xgboost.XGBClassifier()
xgb_model = xgb.fit(x_train_count, train_y)
accuracy = model_selection.cross_val_score(xgb_model, x_test_tf_idf_ngram, test_y, cv = 10, n_jobs = -1, verbose = 10).mean()



[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:   37.5s remaining:  1.5min
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   37.7s remaining:   37.7s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:   37.8s remaining:   16.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   38.1s finished


In [45]:
print("N-GRAM TF-IDF Doğruluk Oranı:", accuracy)

N-GRAM TF-IDF Doğruluk Oranı: 0.5901150627615063


In [46]:
xgb = xgboost.XGBClassifier()
xgb_model = xgb.fit(x_train_count, train_y)
accuracy = model_selection.cross_val_score(xgb_model, x_test_tf_idf_chars, test_y, cv = 10, n_jobs = -1, verbose = 2).mean()



[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:  1.2min remaining:  1.2min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.2min finished


In [47]:
print("CHARLEVEL TF-IDF Doğruluk Oranı:", accuracy)

CHARLEVEL TF-IDF Doğruluk Oranı: 0.7742677824267783
