### Importing libraries


In [1]:
# importing libraries
import pandas as pd 
import numpy as np
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
import joblib
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder 
from sklearn.feature_extraction.text import TfidfVectorizer

KeyboardInterrupt: 

In [None]:
# Mount Google Drive at /content/drive; the dataset CSVs loaded below are read from that mount.
from google.colab import drive
drive.mount('/content/drive')

### Loading the dataset

In [None]:
# Locations of the training CSV and of the held-out ("unseen") CSV on the mounted drive.
train_csv_path = '/content/drive/MyDrive/ArabicSentimentHager/paper1/TF-IDF/dataset2/train_datasetset2.csv'
unseen_csv_path = "/content/drive/MyDrive/ArabicSentimentHager/paper1/TF-IDF/dataset2/unseen_dataset2.csv"

# Read both splits into DataFrames and preview the first training rows.
train_set = pd.read_csv(train_csv_path, encoding='utf-8')
test_set = pd.read_csv(unseen_csv_path)
train_set.head()

In [None]:
# Separate the training frame into the target column and the remaining feature columns.
y_train = train_set['label']
X_train = train_set.drop(columns='label')
X_train.shape, y_train.shape

In [None]:
# Same split for the unseen set: features are every column except 'label'.
X_test = test_set.drop(columns='label')
y_test = test_set['label']
X_test.shape, y_test.shape

### LabelEncoder

In [None]:
# Encode the class labels as integers. The encoder is fitted on the training
# labels only and then applied unchanged to the unseen set, so both splits
# share one label -> integer mapping.
#
# The labels are read from the source frames instead of re-assigning the
# previous values of y_train / y_test: this keeps the cell idempotent, i.e.
# re-running it does not refit the encoder on already-encoded integers and
# encoder_y.classes_ always holds the original label values.
encoder_y = LabelEncoder()
y_train = encoder_y.fit_transform(train_set['label'])
y_test = encoder_y.transform(test_set['label'])

### TF-IDF feature extraction method

In [None]:
# Unigram TF-IDF limited to the 5000 highest-ranked terms. The vocabulary and
# IDF weights are learned from the training texts only; the unseen texts are
# projected onto that same vocabulary.
tfidf_vect = TfidfVectorizer(ngram_range=(1, 1), max_features=5000)
train_texts, unseen_texts = X_train['text'], X_test['text']
tfidf_train = tfidf_vect.fit_transform(train_texts)
tfidf_test = tfidf_vect.transform(unseen_texts)

### Finding the best model and hyperparameters

In [None]:
# Candidate estimators and the hyper-parameter grids searched for each of them.
# Every entry maps a short model name to {'model': estimator, 'params': grid},
# where `grid` is passed to GridSearchCV as its `param_grid` (a dict, or a list
# of dicts when only some parameter combinations are valid).

# Values shared by all LogisticRegression sub-grids.
_lr_C = np.logspace(-3, 3, 7)
_lr_class_weight = ['balanced', None]

model_params = {
    'RF': {
        'model': RandomForestClassifier(),
        'params': {
            'max_depth': [100, 200],
            'criterion': ['gini', 'entropy'],
        },
    },
    'LR': {
        'model': LogisticRegression(solver='sag'),
        # Not every solver supports every penalty, so the grid is split into
        # sub-grids that only contain valid combinations. A single full cross
        # product makes most candidates fail to fit (they are scored as NaN)
        # and wastes the corresponding cross-validation time.
        'params': [
            {   # l2 is supported by every solver.
                'penalty': ['l2'],
                'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
                'C': _lr_C,
                'class_weight': _lr_class_weight,
            },
            {   # l1 is only supported by liblinear and saga.
                'penalty': ['l1'],
                'solver': ['liblinear', 'saga'],
                'C': _lr_C,
                'class_weight': _lr_class_weight,
            },
            {   # elasticnet requires saga and an explicit l1_ratio.
                'penalty': ['elasticnet'],
                'solver': ['saga'],
                'l1_ratio': [0.25, 0.5, 0.75],
                'C': _lr_C,
                'class_weight': _lr_class_weight,
            },
            {   # No regularisation: unsupported by liblinear, and C is ignored,
                # so C is not varied here. scikit-learn >= 1.2 spells this
                # `None`; older releases used the string 'none'.
                'penalty': [None],
                'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
                'class_weight': _lr_class_weight,
            },
        ],
    },
    'DT': {
        'model': DecisionTreeClassifier(),
        'params': {
            'max_depth': [100, 200],
            'max_features': list(range(1, 11)),
            'criterion': ['gini', 'entropy'],
        },
    },
    'NB': {
        'model': MultinomialNB(),
        'params': {'alpha': (1, 0.1, 0.01, 0.001, 0.0001, 0.00001)},
    },
    'KNN': {
        'model': KNeighborsClassifier(),
        # NOTE(review): the TF-IDF matrix is sparse, and scikit-learn documents
        # that fitting on sparse input falls back to brute force regardless of
        # `algorithm`; the four options below are therefore probably
        # equivalent here -- confirm and consider keeping only 'brute'.
        'params': {
            'n_neighbors': [50, 60, 70, 100],
            'weights': ['uniform', 'distance'],
            'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
        },
    },
}

In [None]:
# Run a 10-fold, accuracy-scored grid search for every candidate model.
#   scores : one record per model with its best CV score and parameters
#   model  : model name -> best estimator (refitted on the full training set
#            by GridSearchCV, whose `refit` option defaults to True)
scores = []
model = {}
for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=10,
                       scoring='accuracy', return_train_score=False)
    # Fit exactly once. A second fit would repeat the whole search and, for
    # non-deterministic estimators, could store a best estimator that does not
    # match the best_params_ / best_score_ recorded below.
    clf.fit(tfidf_train, y_train)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_,
    })
    model[model_name] = clf.best_estimator_

# Summary table; best_score is kept next to the parameters that produced it.
df = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])


### Print best values of parameters for each model

In [None]:
# Display the grid-search summary table (one row per model).
df

### Getting cross-validation results on the training set and test results on the unseen set

In [None]:
# Metric names passed to cross_validate(scoring=...) when evaluating the tuned models.
scoring=['accuracy','roc_auc','recall','f1','precision']

In [None]:
# Tuned estimators to evaluate, keyed by short name, in a fixed reporting order.
best_model = {name: model[name] for name in ('RF', 'LR', 'DT', 'NB', 'KNN')}

In [None]:
# Evaluate every tuned model in two ways:
#   finaltrain  : mean stratified 10-fold cross-validation metrics on the
#                 training set (averaged over the seeds in CV_SEEDS)
#   ReultofTest : metrics of the already-fitted model on the unseen set
# All values are percentages rounded to two decimals.

# Shuffle seeds for the stratified folds; add more seeds to average over
# several repetitions of the cross-validation.
CV_SEEDS = [70]

train_records = []
test_records = []
for model_name, Model in best_model.items():
    AccuracyTrain, PrecisionTrain, RecallTrain, F1Train = [], [], [], []

    for r in CV_SEEDS:
        SFold = StratifiedKFold(n_splits=10, shuffle=True, random_state=r)
        # cross_validate fits clones of Model, so Model itself is left untouched.
        Train_Score = cross_validate(Model, tfidf_train, y_train, scoring=scoring, cv=SFold)
        AccuracyTrain.append(round(100 * np.mean(Train_Score['test_accuracy']), 2))
        PrecisionTrain.append(round(100 * np.mean(Train_Score['test_precision']), 2))
        RecallTrain.append(round(100 * np.mean(Train_Score['test_recall']), 2))
        F1Train.append(round(100 * np.mean(Train_Score['test_f1']), 2))

    train_records.append({
        'model': model_name,
        'AccuracyTrain': round(np.mean(AccuracyTrain), 2),
        'PrecisionTrain': round(np.mean(PrecisionTrain), 2),
        'RecallTrain': round(np.mean(RecallTrain), 2),
        'F1Train': round(np.mean(F1Train), 2),
    })

    # Score the unseen set with the model that was trained on the training
    # data (the grid search's refitted best estimator). Cross-validating on
    # the unseen set instead would retrain the model on test folds, so the
    # reported numbers would not measure generalisation to unseen data.
    # NOTE(review): the CV metrics above use the scorers named in `scoring`,
    # while the metrics below use average='weighted' -- confirm that the two
    # tables are meant to use different averaging.
    y_pred = Model.predict(tfidf_test)
    test_records.append({
        'model': model_name,
        'AccuracyTest': round(100 * accuracy_score(y_test, y_pred), 2),
        'PrecisionTest': round(100 * precision_score(y_test, y_pred, average='weighted'), 2),
        'RecallTest': round(100 * recall_score(y_test, y_pred, average='weighted'), 2),
        'F1Test': round(100 * f1_score(y_test, y_pred, average='weighted'), 2),
    })

# Build each table once from the collected records (DataFrame.append was
# removed in pandas 2.0) with an explicit column order.
finaltrain = pd.DataFrame(
    train_records,
    columns=['model', 'AccuracyTrain', 'PrecisionTrain', 'RecallTrain', 'F1Train'],
)
ReultofTest = pd.DataFrame(
    test_records,
    columns=['model', 'AccuracyTest', 'PrecisionTest', 'RecallTest', 'F1Test'],
)
       

### Print average results for the training set and the testing set

In [None]:
# Print the training-set cross-validation table; expand_frame_repr=False keeps
# each row on one line instead of wrapping wide frames.
with pd.option_context('expand_frame_repr', False):
    print (finaltrain)


In [None]:
# Print the unseen-set results table; expand_frame_repr=False keeps each row
# on one line instead of wrapping wide frames.
with pd.option_context('expand_frame_repr', False):
    print (ReultofTest)


### Plot the ROC curves of the models

In [None]:

# Build one ROC curve per tuned model on the unseen set.
# result_table is indexed by classifier name with columns:
#   fpr, tpr : arrays returned by roc_curve
#   auc      : area under the curve, in percent
#   y_pred   : hard class predictions on the unseen set
roc_records = []
for k, v in best_model.items():
    # roc_curve / roc_auc_score need a continuous score per sample; feeding
    # them hard 0/1 predictions collapses the curve to a single threshold.
    # Column 1 of predict_proba is the probability of the class encoded as 1.
    # The already-fitted estimator is used directly, so the unseen set is
    # never used for training.
    y_score = v.predict_proba(tfidf_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_score)
    auc = round(roc_auc_score(y_test, y_score), 5) * 100

    roc_records.append({'classifiers': k,
                        'fpr': fpr,
                        'tpr': tpr,
                        'auc': auc,
                        'y_pred': v.predict(tfidf_test)})

# Build the table once (DataFrame.append was removed in pandas 2.0) and use
# the classifier names as index labels.
result_table = pd.DataFrame(
    roc_records,
    columns=['classifiers', 'fpr', 'tpr', 'auc', 'y_pred'],
).set_index('classifiers')


In [None]:
# Draw every classifier's ROC curve on one set of axes, together with the
# diagonal that corresponds to random guessing.
fig, ax = plt.subplots(figsize=(10, 8))

for clf_name, row in result_table.iterrows():
    ax.plot(row['fpr'],
            row['tpr'],
            label="{}, AUC={:.3f}".format(clf_name, row['auc']))

ax.plot([0, 1], [0, 1], color='orange', linestyle='--')

ax.set_xticks(np.arange(0.0, 1.1, step=0.1))
ax.set_xlabel("False Positive Rate", fontsize=15)

ax.set_yticks(np.arange(0.0, 1.1, step=0.1))
ax.set_ylabel("True Positive Rate", fontsize=15)

ax.set_title('ROC Curve Analysis', fontweight='bold', fontsize=15)
ax.legend(prop={'size': 13}, loc='lower right')

plt.show()