## Text classification

In [1]:
import pandas as pd
import numpy as np

In [2]:
#pip install -U imbalanced-learn

In [2]:
# Oversampling and under sampling
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, NearMiss
from collections import Counter

In [3]:
real_df = pd.read_csv("real_profile.csv")

In [4]:
scam_df = pd.read_csv("scam_profile.csv")

In [5]:
combined_df = pd.concat([real_df,scam_df], ignore_index = True)

In [6]:
pd.set_option("max_colwidth", None)

In [7]:
combined_df.head(10)


Unnamed: 0,username,age,occupation,status,gender,filtered sentence,Language,y,translated_occupation
0,123canwe,66.0,Retired,single,male,I full fire friskier hell,tr,0,Retired
1,123WILFREDO,28.0,ESTUDIANTE,single,male,ME GUSTA CONOCER CHICAS BONDADOSAS CARIÑOSAS Y MUY FUERTES EN LA MORAL,es,0,STUDENT
2,1907,48.0,Construction,single,male,laid back earth good sense humor,en,0,Construction
3,52Jim52,70.0,Retired,divorced,male,Retired owner aerospace consuloting firm,da,0,Retired
4,Aaron90,28.0,Social worker,single,male,Hello aaron Vermont I vermont 2 years Im really fun person I like camping bone fire etc,en,0,Social worker
5,abdelghani,71.0,retired,divorced,male,problem free flexibleseek love happiness,da,0,retired
6,Abdul99,54.0,Executive,separated,male,Easy going Highly educated Life beautiful Travels,ro,0,Executive
7,abdulrahmanwaly,33.0,journalist,single,male,name Abdulrahman person ethics religion work field journalism live Cairo,fr,0,journalist
8,Abou,33.0,Designer,single,male,I serious honest trustworthy like share love fan culture new different person,da,0,Designer
9,Accydave,73.0,Retired,divorced,male,My name David Im 71 years old divorced retired 3 grownup children son 2 daughters,tr,0,Retired


In [7]:
# features
X = combined_df['filtered sentence']

# target
y = combined_df['y']

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 424)

## Random undersampling and oversampling - only description column

In [None]:
ros = RandomOverSampler(random_state=42)
X_train_ros, y_train_ros= ros.fit_resample(xtrain_tfidf, y_train)
rus = RandomUnderSampler(random_state=42)
X_train_rus, y_train_rus= rus.fit_resample(xtrain_tfidf, y_train)

# Naive Bayes classifier
- Taken from: https://towardsdatascience.com/machine-learning-nlp-text-classification-using-scikit-learn-python-and-nltk-c52b92a7c73a

In [10]:
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [11]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=100000)
tfidf_vect.fit(X)
xtrain_tfidf =  tfidf_vect.transform(X_train)
xvalid_tfidf =  tfidf_vect.transform(X_test)
text_clf = MultinomialNB()

### Evaluating the NB classifier

In [15]:
#oversampling - testing error
text_clf = text_clf.fit(X_train_ros, y_train_ros)
predictions = text_clf.predict(xvalid_tfidf)

from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(y_test, predictions)
tn = conf_matrix[0][0]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]
fp = conf_matrix[0][1]

accuracy = (tp + tn)/(tp + tn + fn + fp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2*(precision*recall)/(precision + recall)

print("random oversampling(testing)")
print("Accuracy:",accuracy)
print("Precision:",precision)
print("recall:",recall)
print("f1_score:",f1_score)

#oversampling - training error
predictions = text_clf.predict(X_train_ros)

conf_matrix = confusion_matrix(y_train_ros, predictions)
tn = conf_matrix[0][0]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]
fp = conf_matrix[0][1]

accuracy = (tp + tn)/(tp + tn + fn + fp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2*(precision*recall)/(precision + recall)

print("\n")
print("random oversampling(training)")
print("Accuracy:",accuracy)
print("Precision:",precision)
print("recall:",recall)
print("f1_score:",f1_score)

random oversampling(testing)
Accuracy: 0.9200470311581422
Precision: 0.8156028368794326
recall: 0.9896729776247849
f1_score: 0.8942457231726284


random oversampling(training)
Accuracy: 0.9364000891067052
Precision: 0.8891537544696066
recall: 0.9971040320784139
f1_score: 0.9400399033917882


In [14]:
#random undersampling - testing error
#text_clf = MultinomialNB()
text_clf = text_clf.fit(X_train_rus, y_train_rus)
predictions = text_clf.predict(xvalid_tfidf)

conf_matrix = confusion_matrix(y_test, predictions)
tn = conf_matrix[0][0]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]
fp = conf_matrix[0][1]

accuracy = (tp + tn)/(tp + tn + fn + fp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2*(precision*recall)/(precision + recall)

print("random undersampling(testing)")
print("Accuracy:",accuracy)
print("Precision:",precision)
print("recall:",recall)
print("f1_score:",f1_score)

#undersampling - training error
predictions = text_clf.predict(X_train_rus)

conf_matrix = confusion_matrix(y_train_rus, predictions)
tn = conf_matrix[0][0]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]
fp = conf_matrix[0][1]

accuracy = (tp + tn)/(tp + tn + fn + fp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2*(precision*recall)/(precision + recall)

print('\n')
print("random undersampling(training)")
print("Accuracy:",accuracy)
print("Precision:",precision)
print("recall:",recall)
print("f1_score:",f1_score)

random undersampling(testing)
Accuracy: 0.9200470311581422
Precision: 0.8156028368794326
recall: 0.9896729776247849
f1_score: 0.8942457231726284


random undersampling(training)
Accuracy: 0.9349892008639309
Precision: 0.8873076923076924
recall: 0.9965442764578833
f1_score: 0.9387589013224822


# SVM classifier

In [17]:
from sklearn.linear_model import SGDClassifier

In [18]:
#random oversampling - testing
text_clf_svm = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42)
text_clf_svm = text_clf_svm.fit(X_train_ros, y_train_ros)
predictions = text_clf_svm.predict(xvalid_tfidf)

conf_matrix = confusion_matrix(y_test, predictions)
tn = conf_matrix[0][0]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]
fp = conf_matrix[0][1]

accuracy = (tp + tn)/(tp + tn + fn + fp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2*(precision*recall)/(precision + recall)

print("random oversampling(testing)")
print("Accuracy:",accuracy)
print("Precision:",precision)
print("recall:",recall)
print("f1_score:",f1_score)

#random oversampling - training
predictions = text_clf_svm.predict(X_train_ros)

conf_matrix = confusion_matrix(y_train_ros, predictions)
tn = conf_matrix[0][0]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]
fp = conf_matrix[0][1]

accuracy = (tp + tn)/(tp + tn + fn + fp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2*(precision*recall)/(precision + recall)

print("\n")
print("random oversampling(training)")
print("Accuracy:",accuracy)
print("Precision:",precision)
print("recall:",recall)
print("f1_score:",f1_score)

random oversampling(testing)
Accuracy: 0.921222810111699
Precision: 0.8125874125874126
recall: 1.0
f1_score: 0.8966049382716049


random oversampling(training)
Accuracy: 0.9333927378035197
Precision: 0.882445449184195
recall: 1.0
f1_score: 0.9375522138680034


In [19]:
#random undersampling - testing 
text_clf_svm = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42)
text_clf_svm = text_clf_svm.fit(X_train_rus, y_train_rus)
predictions = text_clf_svm.predict(xvalid_tfidf)

conf_matrix = confusion_matrix(y_test, predictions)
tn = conf_matrix[0][0]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]
fp = conf_matrix[0][1]

accuracy = (tp + tn)/(tp + tn + fn + fp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2*(precision*recall)/(precision + recall)

print("random undersampling(testing)")
print("Accuracy:",accuracy)
print("Precision:",precision)
print("recall:",recall)
print("f1_score:",f1_score)

#random undersampling - training
predictions = text_clf_svm.predict(X_train_rus)

conf_matrix = confusion_matrix(y_train_rus, predictions)
tn = conf_matrix[0][0]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]
fp = conf_matrix[0][1]

accuracy = (tp + tn)/(tp + tn + fn + fp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2*(precision*recall)/(precision + recall)

print("\n")
print("random undersampling(training)")
print("Accuracy:",accuracy)
print("Precision:",precision)
print("recall:",recall)
print("f1_score:",f1_score)

random undersampling(testing)
Accuracy: 0.921222810111699
Precision: 0.8125874125874126
recall: 1.0
f1_score: 0.8966049382716049


random undersampling(training)
Accuracy: 0.9308855291576674
Precision: 0.8785578747628083
recall: 1.0
f1_score: 0.9353535353535354


# Logistic regression

In [21]:
from sklearn.linear_model import LogisticRegression
clf_lg = LogisticRegression()  


### Evaluating logistic regression classifier

In [22]:
#random oversampling - testing
clf_lg = clf_lg.fit(X_train_ros, y_train_ros)
lg_predictions = clf_lg.predict(xvalid_tfidf)

conf_matrix = confusion_matrix(y_test, lg_predictions)
tn = conf_matrix[0][0]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]
fp = conf_matrix[0][1]

accuracy = (tp + tn)/(tp + tn + fn + fp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2*(precision*recall)/(precision + recall)

print("random oversampling(testing)")
print("Accuracy:",accuracy)
print("Precision:",precision)
print("recall:",recall)
print("f1_score:",f1_score)

#random oversampling - training
lg_predictions = clf_lg.predict(X_train_ros)

conf_matrix = confusion_matrix(y_train_ros, lg_predictions)
tn = conf_matrix[0][0]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]
fp = conf_matrix[0][1]

accuracy = (tp + tn)/(tp + tn + fn + fp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2*(precision*recall)/(precision + recall)

print("\n")
print("random oversampling(training)")
print("Accuracy:",accuracy)
print("Precision:",precision)
print("recall:",recall)
print("f1_score:",f1_score)

random oversampling(testing)
Accuracy: 0.9288653733098178
Precision: 0.8304597701149425
recall: 0.9948364888123924
f1_score: 0.9052466718872357


random oversampling(training)
Accuracy: 0.9477611940298507
Precision: 0.9065533980582524
recall: 0.9984406326576075
f1_score: 0.9502809286547227


In [24]:
#random undersampling - testing
clf_lg = clf_lg.fit(X_train_rus, y_train_rus)
lg_predictions = clf_lg.predict(xvalid_tfidf)

conf_matrix = confusion_matrix(y_test, lg_predictions)
tn = conf_matrix[0][0]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]
fp = conf_matrix[0][1]

accuracy = (tp + tn)/(tp + tn + fn + fp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2*(precision*recall)/(precision + recall)

print("random undersampling(testing)")
print("Accuracy:",accuracy)
print("Precision:",precision)
print("recall:",recall)
print("f1_score:",f1_score)

#random undersampling - training
lg_predictions = clf_lg.predict(X_train_rus)

conf_matrix = confusion_matrix(y_train_rus, lg_predictions)
tn = conf_matrix[0][0]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]
fp = conf_matrix[0][1]

accuracy = (tp + tn)/(tp + tn + fn + fp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2*(precision*recall)/(precision + recall)

print('\n')
print("random undersampling(training)")
print("Accuracy:",accuracy)
print("Precision:",precision)
print("recall:",recall)
print("f1_score:",f1_score)

random undersampling(testing)
Accuracy: 0.9218106995884774
Precision: 0.8146067415730337
recall: 0.9982788296041308
f1_score: 0.897138437741686


random undersampling(training)
Accuracy: 0.9384449244060475
Precision: 0.8909861325115562
recall: 0.9991360691144708
f1_score: 0.9419670128283445


# Reading real profile

In [25]:
import json
import os
import glob
import pandas as pd

In [26]:
import json
import os
import glob
import pandas as pd

json_dir = 'data/realprofile'
json_pattern = os.path.join(json_dir, '*.json')
file_list = glob.glob(json_pattern)

dfs = []
for file in file_list:
    with open(file) as f:
        json_data = pd.json_normalize(json.loads(f.read()))
        json_data['site'] = file.rsplit("/", 1)[-1]
    dfs.append(json_data)
realdf = pd.concat(dfs)

#adding a new column based on username column - to match with xxx.jpg when combining image captions
realdf["images"] = realdf["username"] + ".jpg"

#label
realdf["y"] = 1

# Reading scam profile

In [27]:
json_dir = 'data/scamprofile'
json_pattern = os.path.join(json_dir, '*.json')
file_list = glob.glob(json_pattern)

dfs = []
for file in file_list:
    with open(file) as f:
        json_data = pd.json_normalize(json.loads(f.read()))
        json_data['site'] = file.rsplit("/", 1)[-1]
    dfs.append(json_data)
scamdf = pd.concat(dfs)

#modifying the column "images" to join on filename - images = [xxx.jpg], change to xxx.jpg
scamdf["images"] = scamdf["images"].apply(lambda x: ''.join(x))

#label
scamdf["y"] = 0

# Reading image caption json (real)

In [28]:
json_dir = 'final caption model/real_labels'
json_pattern = os.path.join(json_dir, '*.json')
file_list = glob.glob(json_pattern)

fn_list = []
caption_list = []
for file in file_list:
    with open(file) as f:
        data = json.load(f)
        filename = list(data.keys())
        caption = list(data.values())
    fn_list.extend(filename)
    caption_list.extend(caption)

image_real_df = pd.DataFrame(data = {"filename":fn_list,"caption":caption_list})

#drop duplicates 
image_real_df = image_real_df.drop_duplicates(subset = "filename")

# Reading image caption json (scam)

In [29]:
json_dir = 'final caption model/scam_labels'
json_pattern = os.path.join(json_dir, '*.json')
file_list = glob.glob(json_pattern)

fn_list = []
caption_list = []
for file in file_list:
    with open(file) as f:
        data = json.load(f)
        filename = list(data.keys())
        caption = list(data.values())
    fn_list.extend(filename)
    caption_list.extend(caption)

image_scam_df = pd.DataFrame(data = {"filename":fn_list,"caption":caption_list})

In [30]:
##concat scam and real profiles
scam_and_real = pd.concat([scamdf[["username","images","y"]], realdf[["username","images","y"]]], ignore_index=True)
#scam_and_real.info()

In [31]:
#combine image captions + filename (real and scam) into one df
scam_image_and_real_image = pd.concat([image_real_df,image_scam_df],ignore_index = True)
scam_image_and_real_image.rename(columns = {"filename":"images"},inplace = True)

In [32]:
#left join scam image and real image captions to scam_and_real
final_df = scam_and_real.merge(scam_image_and_real_image,how = 'left', on = 'images')

In [33]:
# combine finaldf with combined df
final_df3 = combined_df.merge(final_df[["username","caption"]],how = "left",on = "username")
final_df3.dropna(subset = ["caption","username"],inplace = True)


In [17]:
final_df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4607 entries, 0 to 8504
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   username               4607 non-null   object 
 1   age                    4606 non-null   float64
 2   occupation             4604 non-null   object 
 3   status                 4607 non-null   object 
 4   gender                 4607 non-null   object 
 5   filtered sentence      4607 non-null   object 
 6   Language               4607 non-null   object 
 7   y                      4607 non-null   int64  
 8   translated_occupation  3192 non-null   object 
 9   caption                4607 non-null   object 
dtypes: float64(1), int64(1), object(8)
memory usage: 395.9+ KB


# Image captions

In [34]:
#use final df3 - create model with both description + caption 
# features
X = final_df3['caption']

# target
y = final_df3['y']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 424)

In [35]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

In [36]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=100000)
tfidf_vect.fit(X)
xtrain_tfidf =  tfidf_vect.transform(X_train)
xvalid_tfidf =  tfidf_vect.transform(X_test)

# Undersampling and oversampling for image captions

In [37]:
ros = RandomOverSampler(random_state=42)
X_train_ros, y_train_ros= ros.fit_resample(xtrain_tfidf, y_train)
rus = RandomUnderSampler(random_state=42)
X_train_rus, y_train_rus= rus.fit_resample(xtrain_tfidf, y_train)

# Naive Bayes classifier for image captions

In [57]:
text_clf = MultinomialNB()

In [58]:
#random oversampling - testing
text_clf = text_clf.fit(X_train_ros, y_train_ros)
predictions = text_clf.predict(xvalid_tfidf)

from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(y_test, predictions)
tn = conf_matrix[0][0]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]
fp = conf_matrix[0][1]

accuracy = (tp + tn)/(tp + tn + fn + fp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2*(precision*recall)/(precision + recall)

print("random oversampling(testing)")
print("Accuracy:",accuracy)
print("Precision:",precision)
print("recall:",recall)
print("f1_score:",f1_score)

#random oversampling - training
predictions = text_clf.predict(X_train_ros)

from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(y_train_ros, predictions)
tn = conf_matrix[0][0]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]
fp = conf_matrix[0][1]

accuracy = (tp + tn)/(tp + tn + fn + fp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2*(precision*recall)/(precision + recall)

print("\n")
print("random oversampling(training)")
print("Accuracy:",accuracy)
print("Precision:",precision)
print("recall:",recall)
print("f1_score:",f1_score)

random oversampling(testing)
Accuracy: 0.49783080260303686
Precision: 0.28728070175438597
recall: 0.48698884758364314
f1_score: 0.3613793103448276


random oversampling(training)
Accuracy: 0.778041074249605
Precision: 0.7312746386333772
recall: 0.8791469194312796
f1_score: 0.7984218077474893


In [59]:
#random undersampling - testing 
text_clf = text_clf.fit(X_train_rus, y_train_rus)
predictions = text_clf.predict(xvalid_tfidf)

from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(y_test, predictions)
tn = conf_matrix[0][0]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]
fp = conf_matrix[0][1]

accuracy = (tp + tn)/(tp + tn + fn + fp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2*(precision*recall)/(precision + recall)

print("random undersampling(testing)")
print("Accuracy:",accuracy)
print("Precision:",precision)
print("recall:",recall)
print("f1_score:",f1_score)

#random undersampling - training
predictions = text_clf.predict(X_train_rus)

from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(y_train_rus, predictions)
tn = conf_matrix[0][0]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]
fp = conf_matrix[0][1]

accuracy = (tp + tn)/(tp + tn + fn + fp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2*(precision*recall)/(precision + recall)

print("\n")
print("random undersampling(training)")
print("Accuracy:",accuracy)
print("Precision:",precision)
print("recall:",recall)
print("f1_score:",f1_score)

random undersampling(testing)
Accuracy: 0.4924078091106291
Precision: 0.28322440087145967
recall: 0.483271375464684
f1_score: 0.3571428571428571


random undersampling(training)
Accuracy: 0.7688638334778838
Precision: 0.7227011494252874
recall: 0.8725065047701648
f1_score: 0.7905697445972495


# svm classifier

In [43]:
from sklearn.linear_model import SGDClassifier
text_clf_svm = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42)

#random oversampling - testing
text_clf_svm = text_clf_svm.fit(X_train_ros, y_train_ros)
predictions = text_clf_svm.predict(xvalid_tfidf)

conf_matrix = confusion_matrix(y_test, predictions)
tn = conf_matrix[0][0]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]
fp = conf_matrix[0][1]

accuracy = (tp + tn)/(tp + tn + fn + fp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2*(precision*recall)/(precision + recall)

print("random oversampling(testing)")
print("Accuracy:",accuracy)
print("Precision:",precision)
print("recall:",recall)
print("f1_score:",f1_score)

#random oversampling - training
predictions = text_clf_svm.predict(X_train_ros)

conf_matrix = confusion_matrix(y_train_ros, predictions)
tn = conf_matrix[0][0]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]
fp = conf_matrix[0][1]

accuracy = (tp + tn)/(tp + tn + fn + fp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2*(precision*recall)/(precision + recall)

print("\n")
print("random oversampling(training)")
print("Accuracy:",accuracy)
print("Precision:",precision)
print("recall:",recall)
print("f1_score:",f1_score)


random oversampling(testing)
Accuracy: 0.7114967462039046
Precision: 0.4957983193277311
recall: 0.9007633587786259
f1_score: 0.6395663956639566


random oversampling(training)
Accuracy: 0.7946287519747235
Precision: 0.7353312302839117
recall: 0.9206161137440758
f1_score: 0.8176078568923184


In [45]:
#random undersampling
text_clf_svm = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42)
text_clf_svm = text_clf_svm.fit(X_train_rus, y_train_rus)
predictions = text_clf_svm.predict(xvalid_tfidf)

conf_matrix = confusion_matrix(y_test, predictions)
tn = conf_matrix[0][0]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]
fp = conf_matrix[0][1]

accuracy = (tp + tn)/(tp + tn + fn + fp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2*(precision*recall)/(precision + recall)

print("random undersampling(testing)")
print("Accuracy:",accuracy)
print("Precision:",precision)
print("recall:",recall)
print("f1_score:",f1_score)

#random undersampling - training
predictions = text_clf_svm.predict(X_train_rus)

conf_matrix = confusion_matrix(y_train_rus, predictions)
tn = conf_matrix[0][0]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]
fp = conf_matrix[0][1]

accuracy = (tp + tn)/(tp + tn + fn + fp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2*(precision*recall)/(precision + recall)

print("\n")
print("random undersampling(training)")
print("Accuracy:",accuracy)
print("Precision:",precision)
print("recall:",recall)
print("f1_score:",f1_score)

random undersampling(testing)
Accuracy: 0.7158351409978309
Precision: 0.5
recall: 0.8931297709923665
f1_score: 0.6410958904109589


random undersampling(training)
Accuracy: 0.7931483087597572
Precision: 0.7343966712898752
recall: 0.9184735472679966
f1_score: 0.8161849710982658


# logistic regression classifier

In [46]:
from sklearn.linear_model import LogisticRegression
clf_lg = LogisticRegression()  

In [47]:
#random oversampling - testing
clf_lg = clf_lg.fit(X_train_ros, y_train_ros)
lg_predictions = clf_lg.predict(xvalid_tfidf)
conf_matrix = confusion_matrix(y_test, lg_predictions)
tn = conf_matrix[0][0]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]
fp = conf_matrix[0][1]

accuracy = (tp + tn)/(tp + tn + fn + fp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2*(precision*recall)/(precision + recall)

print("random oversampling(testing)")
print("Accuracy:",accuracy)
print("Precision:",precision)
print("recall:",recall)
print("f1_score:",f1_score)

#random oversampling - training
lg_predictions = clf_lg.predict(X_train_ros)
conf_matrix = confusion_matrix(y_train_ros, lg_predictions)
tn = conf_matrix[0][0]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]
fp = conf_matrix[0][1]

accuracy = (tp + tn)/(tp + tn + fn + fp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2*(precision*recall)/(precision + recall)

print("\n")
print("random oversampling(training)")
print("Accuracy:",accuracy)
print("Precision:",precision)
print("recall:",recall)
print("f1_score:",f1_score)

random oversampling(testing)
Accuracy: 0.7190889370932755
Precision: 0.5033407572383074
recall: 0.8625954198473282
f1_score: 0.6357243319268637


random oversampling(training)
Accuracy: 0.7977883096366508
Precision: 0.7495036399735274
recall: 0.8945497630331753
f1_score: 0.8156283759452647


In [49]:
#random undersampling - testing 
clf_lg = clf_lg.fit(X_train_rus, y_train_rus)
lg_predictions = clf_lg.predict(xvalid_tfidf)

conf_matrix = confusion_matrix(y_test, lg_predictions)
tn = conf_matrix[0][0]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]
fp = conf_matrix[0][1]

accuracy = (tp + tn)/(tp + tn + fn + fp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2*(precision*recall)/(precision + recall)

print("random undersampling(testing)")
print("Accuracy:",accuracy)
print("Precision:",precision)
print("recall:",recall)
print("f1_score:",f1_score)

#random undersampling - training
lg_predictions = clf_lg.predict(X_train_rus)

conf_matrix = confusion_matrix(y_train_rus, lg_predictions)
tn = conf_matrix[0][0]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]
fp = conf_matrix[0][1]

accuracy = (tp + tn)/(tp + tn + fn + fp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2*(precision*recall)/(precision + recall)

print("\n")
print("random undersampling(training)")
print("Accuracy:",accuracy)
print("Precision:",precision)
print("recall:",recall)
print("f1_score:",f1_score)


random undersampling(testing)
Accuracy: 0.7277657266811279
Precision: 0.512249443207127
recall: 0.8778625954198473
f1_score: 0.6469760900140648


random undersampling(training)
Accuracy: 0.792714657415438
Precision: 0.7429805615550756
recall: 0.8950563746747615
f1_score: 0.8119590873328087


# Combining features into one model

In [50]:
df1 = pd.read_csv('final_data_cleaned.csv')
final_df4 = final_df3.merge(df1[["username","translated_occupation_category"]],how = 'left', on = 'username')

In [51]:
from sklearn.preprocessing import LabelEncoder

labelencoder = LabelEncoder()
final_df4[['status','gender']]=final_df4[['status','gender']].apply(labelencoder.fit_transform)
final_df4[['translated_occupation_category_encoded']]=final_df4[['translated_occupation_category']].apply(labelencoder.fit_transform)
final_df4['age_group']= pd.qcut(final_df4['age'],4)
final_df4['age_group_encoded'] = labelencoder.fit_transform(final_df4['age_group'])


In [52]:
final_df5 = final_df4.drop(columns = ["age","occupation","Language","translated_occupation","translated_occupation_category","age_group"])
final_df5.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4608 entries, 0 to 4607
Data columns (total 8 columns):
 #   Column                                  Non-Null Count  Dtype 
---  ------                                  --------------  ----- 
 0   username                                4608 non-null   object
 1   status                                  4608 non-null   int64 
 2   gender                                  4608 non-null   int64 
 3   filtered sentence                       4608 non-null   object
 4   y                                       4608 non-null   int64 
 5   caption                                 4608 non-null   object
 6   translated_occupation_category_encoded  4608 non-null   int64 
 7   age_group_encoded                       4608 non-null   int64 
dtypes: int64(5), object(3)
memory usage: 324.0+ KB


In [53]:
final_df5 = final_df5[["username","status","gender","translated_occupation_category_encoded","age_group_encoded","filtered sentence","caption","y"]].copy()
final_df5.head()

Unnamed: 0,username,status,gender,translated_occupation_category_encoded,age_group_encoded,filtered sentence,caption,y
0,123canwe,14,1,12,3,I full fire friskier hell,man in red shirt is standing on bench next to the cameraman,0
1,123WILFREDO,14,1,2,0,ME GUSTA CONOCER CHICAS BONDADOSAS CARIÑOSAS Y MUY FUERTES EN LA MORAL,man in red shirt is sitting on the edge of river,0
2,52Jim52,10,1,12,3,Retired owner aerospace consuloting firm,man in black shirt is standing in front of wooden bench,0
3,Aaron90,14,1,10,0,Hello aaron Vermont I vermont 2 years Im really fun person I like camping bone fire etc,two girls are playing in the grass,0
4,Abou,14,1,0,1,I serious honest trustworthy like share love fan culture new different person,man in red shirt and black shorts is walking down the street,0


In [79]:
#final_df5.to_csv('all_combined_features.csv')

In [63]:
# features
X = final_df5[["age_group_encoded","status","gender","translated_occupation_category_encoded","filtered sentence","caption"]]

# target
y = final_df5['y']

In [62]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline

def print_results(tp,tn,fp,fn,a):
    accuracy = (tp + tn)/(tp + tn + fn + fp)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1_score = 2*(precision*recall)/(precision + recall)

    print(a)
    print("Accuracy:",accuracy)
    print("Precision:",precision)
    print("recall:",recall)
    print("f1_score:",f1_score)

In [64]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 424)

In [65]:
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

preprocess = ColumnTransformer(
    [('caption_tfidf', TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=100000),'caption'),
     ('filtered sentence tfidf', TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=100000),'filtered sentence')
     ],
    remainder='passthrough')

model = make_pipeline(
    preprocess,
    MultinomialNB() )

In [66]:
rus = RandomUnderSampler(random_state=424)
X_train_rus, y_train_rus= rus.fit_resample(X_train, y_train)
model.fit(X_train_rus, y_train_rus)

y_pred = model.predict(X_test)
y_train_pred = model.predict(X_train_rus)

#for training data - undersampling
from sklearn.metrics import confusion_matrix
conf_matrix2 = confusion_matrix(y_train_rus,y_train_pred)
tn = conf_matrix2[0][0]
fn = conf_matrix2[1][0]
tp = conf_matrix2[1][1]
fp = conf_matrix2[0][1]

print_results(tp,tn,fp,fn,"random undersampling(training)")
print('\n')
#for testing data - undersampling
conf_matrix = confusion_matrix(y_test, y_pred)
tn = conf_matrix[0][0]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]
fp = conf_matrix[0][1]

print_results(tp,tn,fp,fn,"random undersampling(testing)")


random undersampling(training)
Accuracy: 0.9808027923211169
Precision: 0.9867491166077739
recall: 0.974694589877836
f1_score: 0.9806848112379281


random undersampling(testing)
Accuracy: 0.9848156182212582
Precision: 0.967032967032967
recall: 0.9814126394052045
f1_score: 0.974169741697417


In [67]:
ros = RandomOverSampler(random_state = 424)
X_train_ros, y_train_ros= ros.fit_resample(X_train, y_train)
model.fit(X_train_ros,y_train_ros)

y_pred = model.predict(X_test)
y_train_pred = model.predict(X_train_ros)
#for training data - oversampling
from sklearn.metrics import confusion_matrix
conf_matrix2 = confusion_matrix(y_train_ros,y_train_pred)
tn = conf_matrix2[0][0]
fn = conf_matrix2[1][0]
tp = conf_matrix2[1][1]
fp = conf_matrix2[0][1]

print_results(tp,tn,fp,fn,"random oversampling(training)")
print('\n')

#for testing data - oversampling
conf_matrix = confusion_matrix(y_test, y_pred)
tn = conf_matrix[0][0]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]
fp = conf_matrix[0][1]

print_results(tp,tn,fp,fn,"random oversampling(testing)")

random oversampling(training)
Accuracy: 0.9836614173228346
Precision: 0.9774582199766809
recall: 0.9901574803149606
f1_score: 0.9837668687658909


random oversampling(testing)
Accuracy: 0.9772234273318872
Precision: 0.9335664335664335
recall: 0.9925650557620818
f1_score: 0.9621621621621621
