### Kütüphane Yüklenmesi

In [1]:
#Gerekli kütüphaneler
import pickle
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Cihat\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Data Hazırlanması

In [2]:
#HTML taglerinden vs. arındırmak için oluşturulan fonksiyon
def preprocess_tweet(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower())
    text = text+' '.join(emoticons).replace('-', '') 
    return text

In [3]:
#Veri setinin pandas kütüphanesi ile yüklenmesi ve görüntülenmesi
df = pd.read_csv('suicidal_data.csv')
df.head()

Unnamed: 0,label,tweet
0,1,my life is meaningless i just want to end my l...
1,1,muttering i wanna die to myself daily for a fe...
2,1,work slave i really feel like my only purpose ...
3,1,i did something on the 2 of october i overdose...
4,1,i feel like no one cares i just want to die ma...


In [4]:
#Data setin incelenmesi
df.describe()

Unnamed: 0,label
count,9119.0
mean,0.438425
std,0.496221
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [5]:
#Oluşturulan fonksiyonun tweetlere uygulanması
df['tweet'] = df['tweet'].apply(preprocess_tweet)
df['tweet'].head()

0    my life is meaningless i just want to end my l...
1    muttering i wanna die to myself daily for a fe...
2    work slave i really feel like my only purpose ...
3    i did something on the 2 of october i overdose...
4    i feel like no one cares i just want to die ma...
Name: tweet, dtype: object

In [6]:
#Kelimelerdeki ekleri ayırmak ve kelimeleri sade hale getirmek için PorterStemmer import edilmesi
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
#Kelimeleri sadeleştirmek için fonksiyon oluşturulması
def splitter(text):
    return [porter.stem(word) for word in text.split()]

In [7]:
#Stopwordsleri eleme işlemi. Stopword(A ,the,can,be gibi ek kelimeler )
from nltk.corpus import stopwords
#stop değişkeni içine ingilizcede bulunan stop wordlerin yerleştirilmesi
stop = stopwords.words('english')

In [8]:
#fonksiyonun doğru çalıştığının kontrolü
[w for w in splitter('a runner likes running and runs a lot') if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

In [9]:
#Noktalama işaretleri için ayıklama fonksiyonu
def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\(|D|P)',text.lower())
    text = re.sub('[\W]+', ' ', text.lower())
    text += ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in splitter(text) if w not in stop]
    return tokenized

In [10]:
#Ayıklama fonksiyonunun uygulanması
from sklearn.feature_extraction.text import HashingVectorizer
vect = HashingVectorizer(decode_error='ignore', n_features=2**21, 
                         preprocessor=None,tokenizer=tokenizer)

### Model Oluşturma

### Kullanılacak Modeller
### 1-Decision Tree
### 2-Naive Bayes
### 3-Random Forest
### 4-SVM

### Decision Tree

In [11]:
#Decision tree model import
from sklearn.tree import DecisionTreeClassifier 
clf = DecisionTreeClassifier(criterion='entropy')

In [12]:
#Dataframe ayrışması X ve y olarak
X = df["tweet"].to_list()
y = df['label']

In [13]:
#Train ve test olarak 0.2 test olacak şekilde ayarlama
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,
                                                 y,
                                                 test_size=0.20,
                                                 random_state=0)

In [14]:
#Train ve test datalarının sayıya çevrilmesi
X_train = vect.transform(X_train)
X_test = vect.transform(X_test)

In [15]:
print(X_train)

  (0, 151219)	0.2672612419124244
  (0, 154172)	-0.2672612419124244
  (0, 228212)	0.2672612419124244
  (0, 661528)	0.2672612419124244
  (0, 694123)	-0.2672612419124244
  (0, 710246)	0.2672612419124244
  (0, 784864)	0.2672612419124244
  (0, 976248)	0.2672612419124244
  (0, 1322294)	0.2672612419124244
  (0, 1471014)	-0.2672612419124244
  (0, 1596840)	-0.2672612419124244
  (0, 1758113)	0.2672612419124244
  (0, 1832641)	-0.2672612419124244
  (0, 2024407)	-0.2672612419124244
  (1, 338426)	0.5
  (1, 593689)	-0.5
  (1, 744205)	0.5
  (1, 1934985)	0.5
  (2, 32040)	0.22360679774997896
  (2, 112243)	-0.22360679774997896
  (2, 427915)	-0.22360679774997896
  (2, 623124)	-0.22360679774997896
  (2, 963085)	-0.22360679774997896
  (2, 976011)	-0.22360679774997896
  (2, 1047103)	0.22360679774997896
  :	:
  (7294, 1526592)	-0.4629100498862757
  (7294, 1567522)	0.07715167498104596
  (7294, 1622622)	0.07715167498104596
  (7294, 1643429)	0.07715167498104596
  (7294, 1644930)	0.07715167498104596
  (7294, 1670

In [16]:
#Train datayı modele oturtmak
clf.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy')

In [17]:
#Accuracy ölçümü
print('Accuracy: %.3f' % clf.score(X_test, y_test))

Accuracy: 0.874


In [18]:
y_pred = clf.predict(X_test)
#Performans metrics
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
per_class_precision = precision_score(y_pred, y_test, average=None)
print('Per-class precision score:', per_class_precision)
per_class_recall = recall_score(y_pred, y_test, average=None)
print('Per-class recall score:', per_class_recall)
per_class_recall_f1_score = f1_score(y_pred, y_test, average=None)
print('Per-class f1 score:', per_class_recall_f1_score)

Per-class precision score: [0.90188679 0.83638743]
Per-class recall score: [0.88436633 0.86002692]
Per-class f1 score: [0.89304064 0.84804247]


### Test ve Tahminde bulunma

In [19]:
#0 sınıfı negative 1 sınıfı pozitif tweetleri içeriyor. Atılan tweetin pozitif mi negatif mi olduğunu tespit edebiliyoruz
label = {0:'negative', 1:'positive'}
example = ["I'll kill myself am tired of living depressed and alone"]
X = vect.transform(example)
print('Prediction: %s\nProbability: %.2f%%'
      %(label[clf.predict(X)[0]],np.max(clf.predict_proba(X))*100))

Prediction: positive
Probability: 100.00%


In [20]:
label = {0:'negative', 1:'positive'}
example = ["It's such a hot day, I'd like to have ice cream and visit the park"]
X = vect.transform(example)
print('Prediction: %s\nProbability: %.2f%%'
      %(label[clf.predict(X)[0]],np.max(clf.predict_proba(X))*100))

Prediction: negative
Probability: 100.00%


### Random Forest Classifier

In [21]:
print(type(X_train))

<class 'scipy.sparse.csr.csr_matrix'>


In [22]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train,y_train)

RandomForestClassifier()

In [24]:
#Gridsearchcv metodu ile hyperparameter tuning
from sklearn.model_selection import GridSearchCV
param_grid = { 
    'max_features': ['auto', 'sqrt', 'log2'],
        'max_depth' : [4,5,6],
    'criterion' :['gini', 'entropy']
}
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 3)
CV_rfc.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [4, 5, 6],
                         'max_features': ['auto', 'sqrt', 'log2']})

In [25]:
y_pred = CV_rfc.predict(X_test)
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
per_class_precision = precision_score(y_pred, y_test, average=None)
print('Per-class precision score:', per_class_precision)
per_class_recall = recall_score(y_pred, y_test, average=None)
print('Per-class recall score:', per_class_recall)
per_class_recall_f1_score = f1_score(y_pred, y_test, average=None)
print('Per-class f1 score:', per_class_recall_f1_score)

Per-class precision score: [1.         0.35863874]
Per-class recall score: [0.68387097 1.        ]
Per-class f1 score: [0.81226054 0.52793834]


### Test ve Tahminde bulunma

In [34]:
#0 sınıfı negative 1 sınıfı pozitif tweetleri içeriyor. Atılan tweetin pozitif mi negatif mi olduğunu tespit edebiliyoruz
label = {0:'negative', 1:'positive'}
example = ["I'll kill myself am tired of living depressed and alone"]
X = vect.transform(example)
print('Prediction: %s\nProbability: %.2f%%'
      %(label[CV_rfc.predict(X)[0]],np.max(CV_rfc.predict_proba(X))*100))

Prediction: negative
Probability: 58.51%


In [35]:
label = {0:'negative', 1:'positive'}
example = ["It's such a hot day, I'd like to have ice cream and visit the park"]
X = vect.transform(example)
print('Prediction: %s\nProbability: %.2f%%'
      %(label[CV_rfc.predict(X)[0]],np.max(CV_rfc.predict_proba(X))*100))

Prediction: negative
Probability: 59.99%


In [29]:
from sklearn.svm import SVC  
svm = SVC(kernel='linear') 

### SUPPORT VECTOR MACHINE

In [30]:
svm.fit(X_train,y_train)

SVC(kernel='linear')

In [31]:
y_pred = svm.predict(X_test)
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
per_class_precision = precision_score(y_pred, y_test, average=None)
print('Per-class precision score:', per_class_precision)
per_class_recall = recall_score(y_pred, y_test, average=None)
print('Per-class recall score:', per_class_recall)
per_class_recall_f1_score = f1_score(y_pred, y_test, average=None)
print('Per-class f1 score:', per_class_recall_f1_score)

Per-class precision score: [0.95188679 0.86649215]
Per-class recall score: [0.90819082 0.92847125]
Per-class f1 score: [0.92952556 0.89641165]


### Test ve Tahminde bulunma

In [39]:
#0 sınıfı negative 1 sınıfı pozitif tweetleri içeriyor. Atılan tweetin pozitif mi negatif mi olduğunu tespit edebiliyoruz
label = {0:'negative', 1:'positive'}
example = ["I'll kill myself am tired of living depressed and alone"]
X = vect.transform(example)
print('Prediction: %s\nProbability: %.2f%%'
      %(label[svm.predict(X)[0]],np.max(svm.predict(X))*100))

Prediction: positive
Probability: 100.00%


In [40]:
label = {0:'negative', 1:'positive'}
example = ["It's such a hot day, I'd like to have ice cream and visit the park"]
X = vect.transform(example)
print('Prediction: %s\nProbability: %.2f%%'
      %(label[svm.predict(X)[0]],np.max(svm.predict(X))*100))

Prediction: negative
Probability: 0.00%
