# Model Eğitimi - Tensorflow

In [1]:
from textblob import TextBlob
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas, xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers


from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
import pandas as pd 
data = pd.read_csv("Data_Train.csv")

In [3]:
data.head()

Unnamed: 0,STORY,SECTION
0,But the most painful was the huge reversal in ...,3
1,How formidable is the opposition alliance amon...,0
2,Most Asian currencies were trading lower today...,3
3,"If you want to answer any question, click on ‘...",1
4,"In global markets, gold prices edged up today ...",3


In [4]:
data["SECTION"].replace(0, value = "politics", inplace = True)
data["SECTION"].replace(1, value = "technology", inplace = True)
data["SECTION"].replace(2, value = "entertainment", inplace = True)
data["SECTION"].replace(3, value = "business", inplace = True)

In [5]:
data.head()

Unnamed: 0,STORY,SECTION
0,But the most painful was the huge reversal in ...,business
1,How formidable is the opposition alliance amon...,politics
2,Most Asian currencies were trading lower today...,business
3,"If you want to answer any question, click on ‘...",technology
4,"In global markets, gold prices edged up today ...",business


In [6]:
# data = data[(data.category == "negatif") | (data.category == "pozitif")]

In [7]:
data.head()

Unnamed: 0,STORY,SECTION
0,But the most painful was the huge reversal in ...,business
1,How formidable is the opposition alliance amon...,politics
2,Most Asian currencies were trading lower today...,business
3,"If you want to answer any question, click on ‘...",technology
4,"In global markets, gold prices edged up today ...",business


In [8]:
data.groupby("SECTION").count()

Unnamed: 0_level_0,STORY
SECTION,Unnamed: 1_level_1
business,1246
entertainment,1924
politics,1686
technology,2772


In [9]:
df = pd.DataFrame()
df["text"] = data["STORY"]
df["label"] = data["SECTION"]

In [10]:
df.head()

Unnamed: 0,text,label
0,But the most painful was the huge reversal in ...,business
1,How formidable is the opposition alliance amon...,politics
2,Most Asian currencies were trading lower today...,business
3,"If you want to answer any question, click on ‘...",technology
4,"In global markets, gold prices edged up today ...",business


## Metin Ön İşleme

In [11]:
#buyuk-kucuk donusumu
df['text'] = df['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))
#noktalama işaretleri
df['text'] = df['text'].str.replace('[^\w\s]','')
#sayılar
df['text'] = df['text'].str.replace('\d','')
#stopwords
import nltk
# nltk.download('wordnet')
from nltk.corpus import stopwords
sw = stopwords.words('english')
df['text'] = df['text'].apply(lambda x: " ".join(x for x in x.split() if x not in sw))
#seyreklerin silinmesi
sil = pd.Series(' '.join(df['text']).split()).value_counts()[-1000:]
df['text'] = df['text'].apply(lambda x: " ".join(x for x in x.split() if x not in sil))
#lemmi
from textblob import Word
#nltk.download('wordnet')
df['text'] = df['text'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()])) 

## Değişken Mühendisliği

* Count Vectors
* TF-IDF Vectors (words, characters, n-grams)
* Word Embeddings

TF(t) = (Bir t teriminin bir dökümanda gözlenme frekansı) / (dökümandaki toplam terim sayısı) 

IDF(t) = log_e(Toplam döküman sayısı / içinde t terimi olan belge sayısı)


In [12]:
df.head()

Unnamed: 0,text,label
0,painful huge reversal fee income unheard among...,business
1,formidable opposition alliance among congress ...,politics
2,asian currency trading lower today south korea...,business
3,want answer question click answer clicking ans...,technology
4,global market gold price edged today disappoin...,business


In [13]:
df.iloc[0]

text     painful huge reversal fee income unheard among...
label                                             business
Name: 0, dtype: object

## Test-Train

In [14]:
train_x, test_x, train_y, test_y = model_selection.train_test_split(df["text"],
                                                                   df["label"], 
                                                                    random_state = 1)

In [15]:
train_y[0:5]

611     entertainment
588          politics
3135         politics
6728         politics
7347         politics
Name: label, dtype: object

In [16]:
encoder = preprocessing.LabelEncoder()

In [17]:
train_y = encoder.fit_transform(train_y)
test_y = encoder.fit_transform(test_y)

In [18]:
train_y[0:5]

array([1, 2, 2, 2, 2])

In [19]:
test_y[0:5]

array([3, 1, 3, 1, 0])

### Count Vectors

In [20]:
vectorizer = CountVectorizer()
vectorizer.fit(train_x.astype('U').values)

CountVectorizer()

In [21]:
x_train_count = vectorizer.transform(train_x.astype('U').values)
x_test_count = vectorizer.transform(test_x.astype('U').values)

In [22]:
vectorizer.get_feature_names()[0:5]

['aa', 'aaa', 'aadhaar', 'aadhaarenabled', 'aadhar']

In [23]:
x_train_count.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

### TF-IDF

In [24]:
#wordlevel

In [25]:
tf_idf_word_vectorizer = TfidfVectorizer()
tf_idf_word_vectorizer.fit(train_x.astype('U').values)

TfidfVectorizer()

In [26]:
x_train_tf_idf_word = tf_idf_word_vectorizer.transform(train_x.astype('U').values)
x_test_tf_idf_word = tf_idf_word_vectorizer.transform(test_x.astype('U').values)

In [27]:
tf_idf_word_vectorizer.get_feature_names()[0:5]

['aa', 'aaa', 'aadhaar', 'aadhaarenabled', 'aadhar']

In [28]:
x_train_tf_idf_word.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.10983767, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [29]:
# ngram level tf-idf

In [30]:
tf_idf_ngram_vectorizer = TfidfVectorizer(ngram_range = (2,3))
tf_idf_ngram_vectorizer.fit(train_x.astype('U').values)

TfidfVectorizer(ngram_range=(2, 3))

In [31]:
x_train_tf_idf_ngram = tf_idf_ngram_vectorizer.transform(train_x.astype('U').values)
x_test_tf_idf_ngram = tf_idf_ngram_vectorizer.transform(test_x.astype('U').values)

In [32]:
# characters level tf-idf

In [33]:
tf_idf_chars_vectorizer = TfidfVectorizer(analyzer = "char", ngram_range = (2,3))
tf_idf_chars_vectorizer.fit(train_x.astype('U').values)

TfidfVectorizer(analyzer='char', ngram_range=(2, 3))

In [34]:
x_train_tf_idf_chars = tf_idf_chars_vectorizer.transform(train_x.astype('U').values)
x_test_tf_idf_chars = tf_idf_chars_vectorizer.transform(test_x.astype('U').values)

# Makine Öğrenmesi ile Sentiment Sınıflandırması

## Lojistik Regresyon

In [51]:
loj = linear_model.LogisticRegression()
loj_model = loj.fit(x_train_count, train_y)
accuracy = model_selection.cross_val_score(loj_model, 
                                           x_test_count, 
                                           test_y, 
                                           cv = 10).mean()

print("Count Vectors Doğruluk Oranı:", accuracy)

Count Vectors Doğruluk Oranı: 0.9418049049324881


In [36]:
loj = linear_model.LogisticRegression()
loj_model = loj.fit(x_train_tf_idf_word,train_y)
accuracy = model_selection.cross_val_score(loj_model, 
                                           x_test_tf_idf_word, 
                                           test_y, 
                                           cv = 10).mean()

print("Word-Level TF-IDF Doğruluk Oranı:", accuracy)

Word-Level TF-IDF Doğruluk Oranı: 0.9496555524937997


In [37]:
loj = linear_model.LogisticRegression()
loj_model = loj.fit(x_train_tf_idf_ngram,train_y)
accuracy = model_selection.cross_val_score(loj_model, 
                                           x_test_tf_idf_ngram, 
                                           test_y, 
                                           cv = 10).mean()

print("N-GRAM TF-IDF Doğruluk Oranı:", accuracy)

N-GRAM TF-IDF Doğruluk Oranı: 0.6334665197023973


In [38]:
loj = linear_model.LogisticRegression()
loj_model = loj.fit(x_train_tf_idf_chars,train_y)
accuracy = model_selection.cross_val_score(loj_model, 
                                           x_test_tf_idf_chars, 
                                           test_y, 
                                           cv = 10).mean()

print("CHARLEVEL Doğruluk Oranı:", accuracy)

CHARLEVEL Doğruluk Oranı: 0.9402287131441168


## Naive Bayes

In [39]:
nb = naive_bayes.MultinomialNB()
nb_model = nb.fit(x_train_count,train_y)
accuracy = model_selection.cross_val_score(nb_model, 
                                           x_test_count, 
                                           test_y, 
                                           cv = 10).mean()

print("Count Vectors Doğruluk Oranı:", accuracy)

Count Vectors Doğruluk Oranı: 0.9475833562965003


In [40]:
nb = naive_bayes.MultinomialNB()
nb_model = nb.fit(x_train_tf_idf_word,train_y)
accuracy = model_selection.cross_val_score(nb_model, 
                                           x_test_tf_idf_word, 
                                           test_y, 
                                           cv = 10).mean()

print("Word-Level TF-IDF Doğruluk Oranı:", accuracy)

Word-Level TF-IDF Doğruluk Oranı: 0.8474125103334252


In [41]:
nb = naive_bayes.MultinomialNB()
nb_model = nb.fit(x_train_tf_idf_ngram,train_y)
accuracy = model_selection.cross_val_score(nb_model, 
                                           x_test_tf_idf_ngram, 
                                           test_y, 
                                           cv = 10).mean()

print("N-GRAM TF-IDF Doğruluk Oranı:", accuracy)

N-GRAM TF-IDF Doğruluk Oranı: 0.6418682832736291


In [42]:
nb = naive_bayes.MultinomialNB()
nb_model = nb.fit(x_train_tf_idf_chars,train_y)
accuracy = model_selection.cross_val_score(nb_model, 
                                           x_test_tf_idf_chars, 
                                           test_y, 
                                           cv = 10).mean()

print("CHARLEVEL Doğruluk Oranı:", accuracy)

CHARLEVEL Doğruluk Oranı: 0.582083218517498


## Random Forests

In [43]:
rf = ensemble.RandomForestClassifier()
rf_model = rf.fit(x_train_count,train_y)
accuracy = model_selection.cross_val_score(rf_model, 
                                           x_test_count, 
                                           test_y, 
                                           cv = 10).mean()

print("Count Vectors Doğruluk Oranı:", accuracy)

Count Vectors Doğruluk Oranı: 0.9360236979884264


In [44]:
rf = ensemble.RandomForestClassifier()
rf_model = rf.fit(x_train_tf_idf_word,train_y)
accuracy = model_selection.cross_val_score(rf_model, 
                                           x_test_tf_idf_word, 
                                           test_y, 
                                           cv = 10).mean()

print("Word-Level TF-IDF Doğruluk Oranı:", accuracy)

Word-Level TF-IDF Doğruluk Oranı: 0.9323753100027556


In [45]:
rf = ensemble.RandomForestClassifier()
rf_model = loj.fit(x_train_tf_idf_ngram,train_y)
accuracy = model_selection.cross_val_score(rf_model, 
                                           x_test_tf_idf_ngram, 
                                           test_y, 
                                           cv = 10).mean()

print("N-GRAM TF-IDF Doğruluk Oranı:", accuracy)

N-GRAM TF-IDF Doğruluk Oranı: 0.6334665197023973


In [46]:
rf = ensemble.RandomForestClassifier()
rf_model = loj.fit(x_train_tf_idf_chars,train_y)
accuracy = model_selection.cross_val_score(rf_model, 
                                           x_test_tf_idf_chars, 
                                           test_y, 
                                           cv = 10).mean()

print("CHARLEVEL Doğruluk Oranı:", accuracy)

CHARLEVEL Doğruluk Oranı: 0.9402287131441168


## XGBoost

In [47]:
xgb = xgboost.XGBClassifier()
xgb_model = xgb.fit(x_train_count,train_y)
accuracy = model_selection.cross_val_score(xgb_model, 
                                           x_test_count, 
                                           test_y, 
                                           cv = 10).mean()

print("Count Vectors Doğruluk Oranı:", accuracy)

Count Vectors Doğruluk Oranı: 0.926048498208873


In [48]:
xgb = xgboost.XGBClassifier()
xgb_model = xgb.fit(x_train_tf_idf_word,train_y)
accuracy = model_selection.cross_val_score(xgb_model, 
                                           x_test_tf_idf_word, 
                                           test_y, 
                                           cv = 10).mean()

print("Word-Level TF-IDF Doğruluk Oranı:", accuracy)

Word-Level TF-IDF Doğruluk Oranı: 0.9150344447506201


In [49]:
xgb = xgboost.XGBClassifier()
xgb_model = xgb.fit(x_train_tf_idf_ngram,train_y)
accuracy = model_selection.cross_val_score(xgb_model, 
                                           x_test_tf_idf_ngram, 
                                           test_y, 
                                           cv = 10).mean()

print("N-GRAM TF-IDF Doğruluk Oranı:", accuracy)

N-GRAM TF-IDF Doğruluk Oranı: 0.7493524386883439


In [50]:
xgb = xgboost.XGBClassifier()
xgb_model = xgb.fit(x_train_tf_idf_chars,train_y)
accuracy = model_selection.cross_val_score(xgb_model, 
                                           x_test_tf_idf_chars, 
                                           test_y, 
                                           cv = 10).mean()

print("CHARLEVEL Doğruluk Oranı:", accuracy)

CHARLEVEL Doğruluk Oranı: 0.9124386883438962


In [52]:
loj_model

LogisticRegression()

In [62]:
# yeni_yorum = pd.Series("But the most painful was the huge reversal in fee income, unheard of among private sector lenders. Essentially, it means that Yes Bank took it for granted that fees on structured loan deals will be paid and accounted for upfront on its books. As borrowers turned defaulters, the fees tied to these loan deals fell off the cracks. Gill has now vowed to shift to a safer accounting practice of amortizing fee income rather than booking these upfront")

# yeni_yorum = pd.Series("want answer question click answer clicking")

# yeni_yorum = pd.Series("How formidable is the opposition alliance among Congress, Jharkhand Mukti Morcha (JMM) and Jharkhand Vikas Morcha (Prajatantrik)?,0")

# yeni_yorum = pd.Series("what you have learned yours and only yours")

In [63]:
v = CountVectorizer()
v.fit(train_x.astype('U').values)
yeni_yorum = v.transform(yeni_yorum)

In [64]:
loj_model.predict(yeni_yorum)

array([1])

In [96]:
nb_model

MultinomialNB()

In [97]:
nb_model.predict(yeni_yorum)

array([2])

In [98]:
rf_model

RandomForestClassifier()

In [99]:
rf_model.predict(yeni_yorum)

array([2])

In [146]:
xgb_model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=4,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [143]:
xgb_model.predict(yeni_yorum)

array([2])