# Introduction

This is a supervised learning problem. The dataset contains 5000 Turkish complaints belonging to 32 categories. I trained four models in two different ways:

- Model-1: Logistic Regression trained with SGD; it classifies into all 32 categories.
- Model-2: LinearSVC; it classifies into all 32 categories.
- Model-3: The same model as Model-1, but it classifies only the categories selected according to the Pareto principle (the idea that roughly 20% of the categories account for 80% of the complaints).
- Model-4: The same model as Model-2, but it classifies only the categories selected according to the Pareto principle (the idea that roughly 20% of the categories account for 80% of the complaints).

# Importing Libraries

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import nltk

from sklearn.feature_extraction.text import  TfidfVectorizer 
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Data Processing

In [2]:
# Complaint texts are read from the input workbook.
df_input = pd.read_excel('input_data.xlsx')

# Category labels are read from the output workbook; its 'gozlem' column becomes the index.
df_output = pd.read_excel('output_data.xlsx', index_col='gozlem')

# Attach the labels to the texts by matching index values (left join, the default for DataFrame.join).
df = df_input.join(other=df_output, how='left')
df

Unnamed: 0,Input,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,kategori
0,müşterimiz saat aradığı donama şubesi personel...,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,müşterimiz kargosunun alınmadığını belirtiyor ...,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
2,müşteri anlaşmalı müşteri adres alımlardan ürü...,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,müşterimiz tazmin talebinin akıbeti ilgili bil...,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
4,adresten alım yapılmaması müşteri alım talebin...,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4895,personel kargoyu dükkanın önüne bırakıp gitmiş...,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4896,merhaba müşterimiz şube telefonlarına ulaşılam...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,11
4897,alıcı müşteri dükkanı kapalı telefona cevap ve...,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,8
4898,müşterimize kargo teslim edildi görünüyor faka...,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,9


In [3]:
# The per-category indicator columns (integer labels 1 through 32) are dropped;
# only the text and the single 'kategori' label column are kept.
df.drop(columns=list(range(1, 33)), inplace=True)
df

Unnamed: 0,Input,kategori
0,müşterimiz saat aradığı donama şubesi personel...,1
1,müşterimiz kargosunun alınmadığını belirtiyor ...,2
2,müşteri anlaşmalı müşteri adres alımlardan ürü...,1
3,müşterimiz tazmin talebinin akıbeti ilgili bil...,3
4,adresten alım yapılmaması müşteri alım talebin...,2
...,...,...
4895,personel kargoyu dükkanın önüne bırakıp gitmiş...,1
4896,merhaba müşterimiz şube telefonlarına ulaşılam...,11
4897,alıcı müşteri dükkanı kapalı telefona cevap ve...,8
4898,müşterimize kargo teslim edildi görünüyor faka...,9


In [4]:
import nltk
from nltk.tokenize import RegexpTokenizer # Used to split sentences to make them more understandable for the machine.

!pip install TurkishStemmer # Since our data is in Turkish, TurkishStemmer is installed for cleaning (rooting words). https://github.com/otuncelli/turkish-stemmer-python
from TurkishStemmer import TurkishStemmer 
stemmer = TurkishStemmer()
tokenizer = RegexpTokenizer(r'\w+') # The variable that divides sentences into small expressions is defined.
punct_re=lambda x :" ".join(tokenizer.tokenize(x.lower())) # The variable that convert data to lower case is defined.

nltk.download('stopwords') # Download stopwords from nltk. The simpler the data, the better. Therefore, it is desirable to get rid of unnecessary words (stopwords) that do not add meaning to the sentence.

stop_word_list = nltk.corpus.stopwords.words('turkish')

def stopword_extraction(values, stop_words=None):
    """Return ``values`` with stop words removed.

    Parameters
    ----------
    values : str
        Text whose words are separated by whitespace.
    stop_words : iterable of str, optional
        Words to discard. When omitted, the notebook-level ``stop_word_list``
        is used, which keeps existing single-argument calls working unchanged.

    Returns
    -------
    str
        The remaining words, in their original order, joined by single spaces.
    """
    if stop_words is None:
        stop_words = stop_word_list
    # A set gives constant-time membership checks instead of scanning the list for every word.
    blocked = frozenset(stop_words)
    return " ".join(word for word in values.split() if word not in blocked)

def _normalise(text):
    """Lower-case and strip punctuation, stem every token, then drop stop words."""
    cleaned = punct_re(text)
    stemmed = " ".join(stemmer.stem(token) for token in cleaned.split())
    return stopword_extraction(stemmed)

# The three cleaning steps are applied to each complaint in the same order as before,
# composed into a single pass over the column.
df["Input"] = df["Input"].apply(_normalise)
df

ERROR: Invalid requirement: '#'
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\onur\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Input,kategori
0,müşter saat aradık dona şube persone taraf küs...,1
1,müşter kargo alınmadık belirtiyor konu şikayet...,2
2,müşter anlaşma müşter adres alım ürün alınmıyo...,1
3,müşter tazmi talep akıbe ilgi bilgi almak ister,3
4,adres al yapılmama müşter al talep gerçekleşme...,2
...,...,...
4895,personel kargo dükka önün bırakıp git sigar ma...,1
4896,merhap müşter şube telefon ulaşılamama dola şi...,11
4897,alıç müşter dükkan kapa telefon cevap verm kur...,8
4898,müşter kargo tesl edil görünüyor fakat müşter ...,9


In [5]:
# Inputs are the cleaned complaint texts; targets are their category labels.
dataDoc1 = df['Input'].tolist()
dataClass1 = df['kategori'].tolist()

# 20% of the complaints are held out for testing; the seed is fixed so the split is repeatable.
x_train1, x_test1, y_train1, y_test1 = train_test_split(
    dataDoc1,
    dataClass1,
    test_size=0.20,
    random_state=42,
)

# min_df=3: a term that occurs in fewer than 3 documents is ignored.
tfidf_vectorizer = TfidfVectorizer(min_df=3)

# The vectorizer is fitted on the training texts only and then reused, as fitted, for the test texts.
x_train_tfidf1 = tfidf_vectorizer.fit_transform(x_train1)
x_test_tfidf1 = tfidf_vectorizer.transform(x_test1)

# Model-1

In [6]:
from sklearn.linear_model import SGDClassifier

# Logistic regression trained with stochastic gradient descent on all categories.
# NOTE(review): scikit-learn 1.1 renamed loss="log" to loss="log_loss" and 1.3 removed the old
# spelling; switch the value when the environment's scikit-learn is upgraded.
lrsgd = SGDClassifier(loss="log", max_iter=1000, alpha=0.0001, random_state=42)
lrsgd_clf1 = lrsgd.fit(x_train_tfidf1, y_train1)
pred_test_lrsgd1 = lrsgd_clf1.predict(x_test_tfidf1)

# The training score and the test score are compared to check for overfitting:
# if the two values are close, the model is not considered to be overfitting.
pred_train_lrsgd1 = lrsgd_clf1.predict(x_train_tfidf1)  # Training data is predicted too, for the comparison.

# The messages are double-quoted so the apostrophe in "SGD'li" is printed.
# 'SGD''li ...' is two adjacent string literals, which Python joins into "SGDli ...".
print("SGD'li Lojistik Regresyon Model eğitim veri seti doğruluğu: {0:0.4f}".format(metrics.accuracy_score(y_train1, pred_train_lrsgd1)))

print("SGD'li Lojistik Regresyon Model test veri seti doğruluğu: {0:0.4f}".format(metrics.accuracy_score(y_test1, pred_test_lrsgd1)))

SGDli Lojistik Regresyon Model eğitim veri seti doğruluğu: 0.8617
SGDli Lojistik Regresyon Model test veri seti doğruluğu: 0.7918


In [7]:
# Per-category precision / recall / F1 on the test split. zero_division=0 reports 0 for a
# metric whose denominator is zero (e.g. a category that was never predicted).
report_lrsgd1 = metrics.classification_report(y_test1, pred_test_lrsgd1, zero_division=0)
print(report_lrsgd1)

              precision    recall  f1-score   support

           1       0.64      0.89      0.74       178
           2       0.95      0.91      0.93        78
           3       1.00      0.83      0.91         6
           4       0.98      0.98      0.98        44
           6       0.94      0.95      0.95       155
           7       0.88      0.88      0.88        76
           8       0.61      0.86      0.71       100
           9       0.78      0.39      0.52        93
          10       0.81      0.80      0.80        44
          11       0.99      0.99      0.99        85
          12       0.00      0.00      0.00         1
          13       0.86      0.50      0.63        12
          14       0.75      0.60      0.67        10
          15       0.44      0.67      0.53         6
          16       0.00      0.00      0.00        14
          17       0.76      0.76      0.76        17
          18       0.00      0.00      0.00         2
          19       1.00    

# Model-2

In [8]:
from sklearn.svm import LinearSVC

# Linear support-vector classifier trained on all categories.
lrsvc = LinearSVC(C=0.2, random_state=42)
lrsvc_clf1 = lrsvc.fit(x_train_tfidf1, y_train1)

# Both splits are predicted so the training and test scores can be compared:
# if the two values are close, the model is not considered to be overfitting.
pred_train_lrsvc1 = lrsvc_clf1.predict(x_train_tfidf1)
pred_test_lrsvc1 = lrsvc_clf1.predict(x_test_tfidf1)

train_acc_lrsvc1 = metrics.accuracy_score(y_train1, pred_train_lrsvc1)
test_acc_lrsvc1 = metrics.accuracy_score(y_test1, pred_test_lrsvc1)

print('lineer svc Model eğitim veri seti doğruluğu: {0:0.4f}'.format(train_acc_lrsvc1))
print('lineer svc Model test veri seti doğruluğu: {0:0.4f}'.format(test_acc_lrsvc1))

lineer svc Model eğitim veri seti doğruluğu: 0.8696
lineer svc Model test veri seti doğruluğu: 0.7888


In [9]:
# Per-category precision / recall / F1 on the test split. zero_division=0 reports 0 for a
# metric whose denominator is zero (e.g. a category that was never predicted).
report_lrsvc1 = metrics.classification_report(y_test1, pred_test_lrsvc1, zero_division=0)
print(report_lrsvc1)

              precision    recall  f1-score   support

           1       0.65      0.86      0.74       178
           2       0.96      0.91      0.93        78
           3       0.83      0.83      0.83         6
           4       0.98      0.98      0.98        44
           6       0.94      0.95      0.95       155
           7       0.88      0.88      0.88        76
           8       0.62      0.69      0.65       100
           9       0.64      0.54      0.58        93
          10       0.80      0.84      0.82        44
          11       0.99      0.99      0.99        85
          12       0.00      0.00      0.00         1
          13       0.75      0.50      0.60        12
          14       0.75      0.60      0.67        10
          15       0.44      0.67      0.53         6
          16       0.00      0.00      0.00        14
          17       0.75      0.88      0.81        17
          18       0.00      0.00      0.00         2
          19       1.00    

In [10]:
# In order to increase the accuracy and speed of the training, only the 10 complaint categories
# corresponding to 80% of the complaints are kept and the rest is discarded.
PARETO_CATEGORIES = [1, 2, 4, 6, 7, 8, 9, 10, 11, 17]

# isin() expresses the former chain of `!=` comparisons directly, and reset_index(drop=True)
# renumbers the rows from 0 without creating a temporary 'index' column.
# (The earlier `df.set_index('index')` call discarded its result, so it had no effect.)
df = df[df['kategori'].isin(PARETO_CATEGORIES)].reset_index(drop=True)

dataDoc2 = df['Input'].values.tolist()
dataClass2 = df['kategori'].values.tolist()  # Category data is selected as the output.

x_train2, x_test2, y_train2, y_test2 = train_test_split(dataDoc2, dataClass2, test_size=0.20, random_state=42)

# NOTE: this re-fits the same vectorizer object that was fitted for the 32-category models,
# so from here on `tfidf_vectorizer` holds the vocabulary of the reduced training set only.
x_train_tfidf2 = tfidf_vectorizer.fit_transform(x_train2)
x_test_tfidf2 = tfidf_vectorizer.transform(x_test2)

# Model-3

In [11]:
from sklearn.base import clone
from sklearn.linear_model import SGDClassifier

# An unfitted copy of `lrsgd` (same hyper-parameters) is trained here. Calling lrsgd.fit() again
# would re-fit the very object that `lrsgd_clf1` refers to, silently replacing Model-1 with this model.
lrsgd_clf2 = clone(lrsgd).fit(x_train_tfidf2, y_train2)
pred_test_lrsgd2 = lrsgd_clf2.predict(x_test_tfidf2)

# Training data is predicted too, so the training and test scores can be compared.
pred_train_lrsgd2 = lrsgd_clf2.predict(x_train_tfidf2)

# The messages are double-quoted so the apostrophe in "SGD'li" is printed.
# 'SGD''li ...' is two adjacent string literals, which Python joins into "SGDli ...".
print("SGD'li Lojistik Regresyon Model eğitim veri seti doğruluğu: {0:0.4f}".format(metrics.accuracy_score(y_train2, pred_train_lrsgd2)))

print("SGD'li Lojistik Regresyon Model test veri seti doğruluğu: {0:0.4f}".format(metrics.accuracy_score(y_test2, pred_test_lrsgd2)))

# Per-category precision / recall / F1 on the reduced test split.
print(metrics.classification_report(y_true=y_test2, y_pred=pred_test_lrsgd2, zero_division=0))

SGDli Lojistik Regresyon Model eğitim veri seti doğruluğu: 0.8975
SGDli Lojistik Regresyon Model test veri seti doğruluğu: 0.8584
              precision    recall  f1-score   support

           1       0.83      0.96      0.89       172
           2       1.00      0.87      0.93        84
           4       0.97      1.00      0.98        28
           6       0.95      0.96      0.96       139
           7       0.99      0.93      0.96        94
           8       0.64      0.53      0.58       101
           9       0.54      0.63      0.58        82
          10       0.93      0.87      0.90        45
          11       0.97      0.97      0.97       102
          17       1.00      0.72      0.84        29

    accuracy                           0.86       876
   macro avg       0.88      0.84      0.86       876
weighted avg       0.86      0.86      0.86       876



# Model-4

In [12]:
from sklearn.base import clone
from sklearn.svm import LinearSVC

# An unfitted copy of `lrsvc` (same hyper-parameters) is trained here. Calling lrsvc.fit() again
# would re-fit the very object that `lrsvc_clf1` refers to, silently replacing Model-2 with this model.
lrsvc_clf2 = clone(lrsvc).fit(x_train_tfidf2, y_train2)
pred_test_lrsvc2 = lrsvc_clf2.predict(x_test_tfidf2)

# Training data is predicted too, so the training and test scores can be compared.
pred_train_lrsvc2 = lrsvc_clf2.predict(x_train_tfidf2)
print('lineer svc Model eğitim veri seti doğruluğu: {0:0.4f}'.format(metrics.accuracy_score(y_train2, pred_train_lrsvc2)))

print('lineer svc Model test veri seti doğruluğu: {0:0.4f}'.format(metrics.accuracy_score(y_test2, pred_test_lrsvc2)))

# Per-category precision / recall / F1 on the reduced test split.
print(metrics.classification_report(y_true=y_test2, y_pred=pred_test_lrsvc2, zero_division=0))

lineer svc Model eğitim veri seti doğruluğu: 0.8969
lineer svc Model test veri seti doğruluğu: 0.8699
              precision    recall  f1-score   support

           1       0.82      0.95      0.88       172
           2       1.00      0.87      0.93        84
           4       0.96      0.93      0.95        28
           6       0.95      0.96      0.95       139
           7       0.99      0.93      0.96        94
           8       0.62      0.89      0.73       101
           9       0.78      0.35      0.49        82
          10       0.91      0.89      0.90        45
          11       0.97      0.97      0.97       102
          17       1.00      0.72      0.84        29

    accuracy                           0.87       876
   macro avg       0.90      0.85      0.86       876
weighted avg       0.88      0.87      0.86       876

