In [1]:
import pandas as pd
import numpy as np

# PRE-PROCESSING
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy

# TD-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# TRAIN/TEST SET
from sklearn.model_selection import StratifiedKFold, train_test_split

from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report 

nlp = spacy.load("el_core_news_sm")

# testing  = pd.read_csv('TWEETS_nolinks.csv')
testing  = pd.read_csv('CHATGPT TWEETS_nolinks.csv')

# Replace empty strings with NaN
testing['class'] = testing['class'].replace(' ', np.nan)
final = testing[testing['class'].notnull()]
final = final.reset_index()
final = final[['Tweet', 'class']]
# Use astype to convert to float
final['class'] = final['class'].astype(float)
final.head()


Unnamed: 0,Tweet,class
0,Η απορία της ημέρας!!\n\nΓιατί σε φωτογραφία π...,0.0
1,Ρε κόψτε την σύνδεση με τον βλακα Καραγιάννη έ...,1.0
2,Ότι ήσουν μαλάκω το ξέραμε. Ότι ήσουν αμπαλη ...,1.0
3,ΠΟΙΟΣ ΓΑΜΙΟΛΗΣ ΚΡΥΒΕΤΑΙ ΠΙΣΩ ΑΠΤΗΝΕ ΑΝΑΠΑΤΕΧΗ ...,1.0
4,Βλεπω αυτόν τον πατέρα στην Σκατιανα και μου έ...,1.0


In [2]:
def replaceMultiple(main, replacements, new):
    for elem in replacements:
        if elem in main:
            main = main.replace(elem, new)

    return main


def normalize(x):
    x = x.replace('ά', 'α')
    x = x.replace('έ', 'ε')
    x = x.replace('ή', 'η')
    x = replaceMultiple(x, ['ί', 'ΐ', 'ϊ'], 'ι')
    x = x.replace('ό', 'ο')
    x = replaceMultiple(x, ['ύ', 'ΰ', 'ϋ'], 'υ')
    x = x.replace('ώ', 'ω')
    return x

def remove_stopwords(text):
    stop_words = stopwords.words('greek')
 
    imp_words = []
 
    # Storing the important words
    for word in str(text).split():
 
        if word not in stop_words:
 
            # Let's Lemmatize the word as well -- to fernw stin arxiki tou morfi
#             before appending to the imp_words list.
 
            lemmatizer = WordNetLemmatizer()
            lemmatizer.lemmatize(word) #doesnt really work
 
            imp_words.append(word)
 
    output = " ".join(imp_words)
 
    return output


punctuations_list = string.punctuation
def remove_punctuations(text):
    temp = str.maketrans('', '', punctuations_list)
    return text.translate(temp)

def text_processing(text): 
    text = normalize(text)  #diwxnw tonous
    text = text.lower() #ola mikra
    text = remove_punctuations(text) #diwxnw punctiations
    text = remove_stopwords(text) #diwxnw stopwords
#     text  = " ".join([w.lemma_ for w in nlp(text)]) #lemmatization is not really working well either
    return text
#     return [word for word in text.split() ] #tokenization



In [3]:
final['Tweet'] = final['Tweet'].apply(lambda x: text_processing(x))

In [4]:
Tfidf_vect = TfidfVectorizer(ngram_range = (2, 2)) 
#maximum of 2000 unique words/features as we have set parameter max_features=2000
final_tdidf = Tfidf_vect.fit_transform(final['Tweet'])

print(len(Tfidf_vect.vocabulary_)) 
#decide that many cause after was over analysing and 
# separating wrongly words

9665


In [5]:
kfold = StratifiedKFold(n_splits=5, random_state=10, shuffle=True)
for i, (train_index, test_index) in enumerate(kfold.split(final_tdidf, final['class'])):
        X_train, X_test, Y_train, Y_test = final_tdidf[train_index], final_tdidf[test_index], final['class'][train_index], final['class'][test_index]

# SVM

In [6]:
# from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV


# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001, 'scale'],
              'kernel': ['rbf', 'linear']} 

# grid = GridSearchCV(svm.SVC(), param_grid, refit = True, verbose = 3)
grid = RandomizedSearchCV(svm.SVC(class_weight='balanced'),  param_grid, cv=3, random_state=42, refit = True, verbose = 2)

# fitting the model for grid search
grid.fit(X_train, Y_train)
# print best parameter after tuning
print(grid.best_params_)
# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END ...................C=0.1, gamma=0.01, kernel=linear; total time=   0.0s
[CV] END ...................C=0.1, gamma=0.01, kernel=linear; total time=   0.0s
[CV] END ...................C=0.1, gamma=0.01, kernel=linear; total time=   0.0s
[CV] END .........................C=100, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=100, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=100, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .................C=100, gamma=0.0001, kernel=linear; total time=   0.0s
[CV] END .................C=100, gamma=0.0001, kernel=linear; total time=   0.0s
[CV] END .................C=100, gamma=0.0001, k

In [7]:
grid_predictions = grid.predict(X_test)
  
# print classification report
print(classification_report(Y_test, grid_predictions))

              precision    recall  f1-score   support

         0.0       0.65      0.96      0.77        57
         1.0       0.92      0.44      0.60        54

    accuracy                           0.71       111
   macro avg       0.79      0.70      0.69       111
weighted avg       0.78      0.71      0.69       111



# Multinomial Naïve Bayes (works with occurrence counts) 

In [8]:
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(X_train,Y_train)
# predict the labels on validation dataset
predictions_NB = Naive.predict(X_test)
# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Y_test)*100)

# print classification report
print(classification_report(Y_test, predictions_NB))

Naive Bayes Accuracy Score ->  70.27027027027027
              precision    recall  f1-score   support

         0.0       0.65      0.93      0.76        57
         1.0       0.86      0.46      0.60        54

    accuracy                           0.70       111
   macro avg       0.75      0.70      0.68       111
weighted avg       0.75      0.70      0.68       111



Multinomial Naïve Bayes (works with occurrence counts) - 2 classes

In [12]:
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(X_train,Y_train_2)
# predict the labels on validation dataset
predictions_NB = Naive.predict(X_test)
# Use accuracy_score function to get the accuracy
# print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Y_test_2)*100)

# print classification report
print(classification_report(Y_test_2, predictions_NB))

              precision    recall  f1-score   support

         0.0       0.70      1.00      0.82       584
         1.0       0.50      0.00      0.01       253

    accuracy                           0.70       837
   macro avg       0.60      0.50      0.41       837
weighted avg       0.64      0.70      0.58       837



# Random Forest 

In [9]:
# defining parameter range

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 300, num = 10)]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'criterion': ['gini', 'entropy'],
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

from sklearn.ensemble import RandomForestClassifier
RandomForest = RandomForestClassifier(random_state =0, class_weight='balanced')

grid = RandomizedSearchCV(RandomForest, random_grid, cv=3, random_state=42, refit = True, verbose = 2)
# fitting the model for grid search
grid.fit(X_train, Y_train)
# print best parameter after tuning
print(grid.best_params_)
# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

grid_predictions = grid.predict(X_test)
  
# print classification report
print(classification_report(Y_test, grid_predictions))


Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END bootstrap=True, criterion=gini, max_depth=100, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=   0.3s
[CV] END bootstrap=True, criterion=gini, max_depth=100, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=   0.3s
[CV] END bootstrap=True, criterion=gini, max_depth=100, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=   0.3s
[CV] END bootstrap=False, criterion=entropy, max_depth=60, min_samples_leaf=4, min_samples_split=10, n_estimators=222; total time=   0.2s
[CV] END bootstrap=False, criterion=entropy, max_depth=60, min_samples_leaf=4, min_samples_split=10, n_estimators=222; total time=   0.3s
[CV] END bootstrap=False, criterion=entropy, max_depth=60, min_samples_leaf=4, min_samples_split=10, n_estimators=222; total time=   0.2s
[CV] END bootstrap=False, criterion=gini, max_depth=110, min_samples_leaf=2, min_samples_split=2, n_estimators=222; tota

# SGDClassifier 

In [10]:
from sklearn.linear_model import SGDClassifier

SGDC = SGDClassifier(class_weight='balanced')

# Create the random grid
random_grid = { "loss" : ["hinge", "log", "squared_hinge", "modified_huber", "perceptron"],
                "alpha" : [0.0001, 0.001, 0.01, 0.1],
                "penalty" : ["l2", "l1", "elasticnet", "none"]}

grid = RandomizedSearchCV(SGDC, random_grid, cv=3, random_state=42, refit = True, verbose = 2)


# fitting the model for grid search
grid.fit(X_train, Y_train)
# print best parameter after tuning
print(grid.best_params_)
# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

grid_predictions = grid.predict(X_test)
  
# print classification report
print(classification_report(Y_test, grid_predictions))

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END alpha=0.001, loss=squared_hinge, penalty=elasticnet; total time=   0.0s
[CV] END alpha=0.001, loss=squared_hinge, penalty=elasticnet; total time=   0.0s
[CV] END alpha=0.001, loss=squared_hinge, penalty=elasticnet; total time=   0.0s
[CV] END ...............alpha=0.0001, loss=hinge, penalty=l2; total time=   0.0s
[CV] END ...............alpha=0.0001, loss=hinge, penalty=l2; total time=   0.0s
[CV] END ...............alpha=0.0001, loss=hinge, penalty=l2; total time=   0.0s
[CV] END ........alpha=0.001, loss=hinge, penalty=elasticnet; total time=   0.0s
[CV] END ........alpha=0.001, loss=hinge, penalty=elasticnet; total time=   0.0s
[CV] END ........alpha=0.001, loss=hinge, penalty=elasticnet; total time=   0.0s
[CV] END ......alpha=0.001, loss=squared_hinge, penalty=none; total time=   0.0s
[CV] END ......alpha=0.001, loss=squared_hinge, penalty=none; total time=   0.0s
[CV] END ......alpha=0.001, loss=squared_hinge, 




# XGBoost 

In [11]:
from sklearn.utils.class_weight import compute_sample_weight
from xgboost import XGBClassifier
from sklearn.metrics import f1_score

sample_weights = compute_sample_weight(
    class_weight='balanced',
    y=Y_train 
)

xgb = XGBClassifier()

# Create the random grid
random_grid = { "learning_rate" : [0.05,0.1,0.15,0.20],
                 "n_estimators" : [100, 500, 900, 1100, 1500],
                 "max_depth" : [ 3, 5, 6, 10, 15],
                 "min_child_weight" : [ 1, 3, 5, 7 ],
                 "gamma" : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ]}

grid = RandomizedSearchCV(xgb, random_grid, cv=3, random_state=42, refit = True, verbose = 2)

# fitting the model for grid search
grid.fit(X_train, Y_train, sample_weight=sample_weights)
# print best parameter after tuning
print(grid.best_params_)
# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

grid_predictions_xgb = grid.predict(X_test)
  
# print classification report
print(classification_report(Y_test, grid_predictions_xgb))

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END gamma=0.2, learning_rate=0.2, max_depth=5, min_child_weight=3, n_estimators=500; total time=   0.6s
[CV] END gamma=0.2, learning_rate=0.2, max_depth=5, min_child_weight=3, n_estimators=500; total time=   0.2s
[CV] END gamma=0.2, learning_rate=0.2, max_depth=5, min_child_weight=3, n_estimators=500; total time=   0.3s
[CV] END gamma=0.3, learning_rate=0.15, max_depth=6, min_child_weight=7, n_estimators=1500; total time=   1.0s
[CV] END gamma=0.3, learning_rate=0.15, max_depth=6, min_child_weight=7, n_estimators=1500; total time=   0.9s
[CV] END gamma=0.3, learning_rate=0.15, max_depth=6, min_child_weight=7, n_estimators=1500; total time=   1.0s
[CV] END gamma=0.2, learning_rate=0.05, max_depth=10, min_child_weight=1, n_estimators=100; total time=   0.0s
[CV] END gamma=0.2, learning_rate=0.05, max_depth=10, min_child_weight=1, n_estimators=100; total time=   0.0s
[CV] END gamma=0.2, learning_rate=0.05, max_depth=10, min