## Text classification

In [1]:
import pandas as pd
import numpy as np

In [28]:
#pip install -U imbalanced-learn

In [2]:
# Oversampling and under sampling
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, NearMiss
from collections import Counter

In [3]:
real_df = pd.read_csv("real_profile.csv")

In [4]:
scam_df = pd.read_csv("scam_profile.csv")

In [7]:
combined_df = pd.concat([real_df,scam_df], ignore_index = True)

In [8]:
pd.set_option("max_colwidth", None)

In [9]:
combined_df.head(10)


Unnamed: 0,username,age,occupation,status,gender,filtered sentence,Language,y,translated_occupation
0,123canwe,66.0,Retired,single,male,I full fire friskier hell,tr,0,Retired
1,123WILFREDO,28.0,ESTUDIANTE,single,male,ME GUSTA CONOCER CHICAS BONDADOSAS CARIÑOSAS Y MUY FUERTES EN LA MORAL,es,0,STUDENT
2,1907,48.0,Construction,single,male,laid back earth good sense humor,en,0,Construction
3,52Jim52,70.0,Retired,divorced,male,Retired owner aerospace consuloting firm,da,0,Retired
4,Aaron90,28.0,Social worker,single,male,Hello aaron Vermont I vermont 2 years Im really fun person I like camping bone fire etc,en,0,Social worker
5,abdelghani,71.0,retired,divorced,male,problem free flexibleseek love happiness,da,0,retired
6,Abdul99,54.0,Executive,separated,male,Easy going Highly educated Life beautiful Travels,ro,0,Executive
7,abdulrahmanwaly,33.0,journalist,single,male,name Abdulrahman person ethics religion work field journalism live Cairo,fr,0,journalist
8,Abou,33.0,Designer,single,male,I serious honest trustworthy like share love fan culture new different person,da,0,Designer
9,Accydave,73.0,Retired,divorced,male,My name David Im 71 years old divorced retired 3 grownup children son 2 daughters,tr,0,Retired


In [10]:
# features
X = combined_df['filtered sentence']

# target
y = combined_df['y']

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 424)

# output for slides

In [12]:
# word level tf-idf
#tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=100000)
#tfidf_vect.fit(X)
#xtrain_tfidf =  tfidf_vect.transform(X_train)
#xvalid_tfidf =  tfidf_vect.transform(X_test)

In [13]:
#XX = combined_df['filtered sentence'].iloc[2:5]

In [14]:
#tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=100000)
#tfidf_vect.fit(XX)
#xtrain_tfidf =  tfidf_vect.transform(XX)


In [15]:
#print(tfidf_vect.get_feature_names())  
#print('\n')
#print(xtrain_tfidf.shape)
#print('\n')
#print(xtrain_tfidf)

In [16]:
#print(xtrain_tfidf.toarray())


# Naive Bayes classifier
- Taken from: https://towardsdatascience.com/machine-learning-nlp-text-classification-using-scikit-learn-python-and-nltk-c52b92a7c73a

In [17]:
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

#WITHOUT OVERSAMPLING/UNDER SAMPLING
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                      ('clf', MultinomialNB()),
 ])
text_clf = text_clf.fit(X_train, y_train)

In [19]:
predictions = text_clf.predict(X_test)

### Evaluating the NB classifier

In [20]:
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(y_test, predictions)
tn = conf_matrix[0][0]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]
fp = conf_matrix[0][1]

accuracy = (tp + tn)/(tp + tn + fn + fp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2*(precision*recall)/(precision + recall)

print("original dataset")
print("Accuracy:",accuracy)
print("Precision:",precision)
print("recall:",recall)
print("f1_score:",f1_score)

original dataset
Accuracy: 0.9188712522045855
Precision: 0.8150782361308677
recall: 0.9862306368330465
f1_score: 0.8925233644859812


In [21]:
#random oversampling
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=100000)
tfidf_vect.fit(X)
xtrain_tfidf =  tfidf_vect.transform(X_train)
xvalid_tfidf =  tfidf_vect.transform(X_test)

text_clf = MultinomialNB()
ros = RandomOverSampler(random_state=42)

X_train_ros, y_train_ros= ros.fit_resample(xtrain_tfidf, y_train)
text_clf = text_clf.fit(X_train_ros, y_train_ros)
predictions = text_clf.predict(xvalid_tfidf)

In [22]:
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(y_test, predictions)
tn = conf_matrix[0][0]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]
fp = conf_matrix[0][1]

accuracy = (tp + tn)/(tp + tn + fn + fp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2*(precision*recall)/(precision + recall)

print("random oversampling")
print("Accuracy:",accuracy)
print("Precision:",precision)
print("recall:",recall)
print("f1_score:",f1_score)

random oversampling
Accuracy: 0.9200470311581422
Precision: 0.8156028368794326
recall: 0.9896729776247849
f1_score: 0.8942457231726284


In [23]:
#random undersampling
rus = RandomUnderSampler(random_state=42)
X_train_rus, y_train_rus= rus.fit_resample(xtrain_tfidf, y_train)
text_clf = MultinomialNB()
text_clf = text_clf.fit(X_train_rus, y_train_rus)
predictions = text_clf.predict(xvalid_tfidf)

In [24]:
conf_matrix = confusion_matrix(y_test, predictions)
tn = conf_matrix[0][0]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]
fp = conf_matrix[0][1]

accuracy = (tp + tn)/(tp + tn + fn + fp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2*(precision*recall)/(precision + recall)

print("random undersampling")
print("Accuracy:",accuracy)
print("Precision:",precision)
print("recall:",recall)
print("f1_score:",f1_score)

random undersampling
Accuracy: 0.9200470311581422
Precision: 0.8156028368794326
recall: 0.9896729776247849
f1_score: 0.8942457231726284


# SVM classifier

In [26]:
from sklearn.linear_model import SGDClassifier
text_clf_svm = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',
                                            alpha=1e-3, random_state=42)),
])

text_clf_svm = text_clf_svm.fit(X_train, y_train)

In [74]:
predictions = text_clf_svm.predict(X_test)

### Evaluating the SVM classifier

In [75]:
conf_matrix = confusion_matrix(y_test, predictions)
tn = conf_matrix[0][0]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]
fp = conf_matrix[0][1]

accuracy = (tp + tn)/(tp + tn + fn + fp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2*(precision*recall)/(precision + recall)

print("Accuracy:",accuracy)
print("Precision:",precision)
print("recall:",recall)
print("f1_score:",f1_score)

Accuracy: 0.9227166276346604
Precision: 0.8228187919463087
recall: 1.0
f1_score: 0.9027982326951398


In [27]:
#random oversampling
text_clf_svm = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42)

ros = RandomOverSampler(random_state=42)

X_train_ros, y_train_ros= ros.fit_resample(xtrain_tfidf, y_train)
text_clf_svm = text_clf_svm.fit(X_train_ros, y_train_ros)
predictions = text_clf_svm.predict(xvalid_tfidf)

In [28]:
conf_matrix = confusion_matrix(y_test, predictions)
tn = conf_matrix[0][0]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]
fp = conf_matrix[0][1]

accuracy = (tp + tn)/(tp + tn + fn + fp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2*(precision*recall)/(precision + recall)

print("oversampling")
print("Accuracy:",accuracy)
print("Precision:",precision)
print("recall:",recall)
print("f1_score:",f1_score)

oversampling
Accuracy: 0.921222810111699
Precision: 0.8125874125874126
recall: 1.0
f1_score: 0.8966049382716049


In [29]:
#random undersampling
text_clf_svm = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42)

rus = RandomUnderSampler(random_state=42)
X_train_rus, y_train_rus= rus.fit_resample(xtrain_tfidf, y_train)
text_clf_svm = text_clf_svm.fit(X_train_rus, y_train_rus)
predictions = text_clf_svm.predict(xvalid_tfidf)

In [30]:
conf_matrix = confusion_matrix(y_test, predictions)
tn = conf_matrix[0][0]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]
fp = conf_matrix[0][1]

accuracy = (tp + tn)/(tp + tn + fn + fp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2*(precision*recall)/(precision + recall)

print("undersampling")
print("Accuracy:",accuracy)
print("Precision:",precision)
print("recall:",recall)
print("f1_score:",f1_score)

undersampling
Accuracy: 0.921222810111699
Precision: 0.8125874125874126
recall: 1.0
f1_score: 0.8966049382716049


# Logistic regression

In [31]:
from sklearn.linear_model import LogisticRegression
# Train model
clf_lg = LogisticRegression()  
clf_lg = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf',LogisticRegression()),
])
clf_lg = clf_lg.fit(X_train, y_train) 

In [33]:
lg_predictions = clf_lg.predict(X_test)

### Evaluating logistic regression classifier

In [34]:
conf_matrix = confusion_matrix(y_test, lg_predictions)
tn = conf_matrix[0][0]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]
fp = conf_matrix[0][1]

accuracy = (tp + tn)/(tp + tn + fn + fp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2*(precision*recall)/(precision + recall)

print("Accuracy:",accuracy)
print("Precision:",precision)
print("recall:",recall)
print("f1_score:",f1_score)

Accuracy: 0.932392710170488
Precision: 0.8498498498498499
recall: 0.9741824440619621
f1_score: 0.9077786688051322


In [35]:
#random oversampling
clf_lg = LogisticRegression()  

ros = RandomOverSampler(random_state=42)

X_train_ros, y_train_ros= ros.fit_resample(xtrain_tfidf, y_train)
clf_lg = clf_lg.fit(X_train_ros, y_train_ros)
lg_predictions = clf_lg.predict(xvalid_tfidf)

In [36]:
conf_matrix = confusion_matrix(y_test, lg_predictions)
tn = conf_matrix[0][0]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]
fp = conf_matrix[0][1]

accuracy = (tp + tn)/(tp + tn + fn + fp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2*(precision*recall)/(precision + recall)

print("oversampling")
print("Accuracy:",accuracy)
print("Precision:",precision)
print("recall:",recall)
print("f1_score:",f1_score)

oversampling
Accuracy: 0.9288653733098178
Precision: 0.8304597701149425
recall: 0.9948364888123924
f1_score: 0.9052466718872357


In [37]:
#random undersampling
clf_lg = LogisticRegression()  

ros = RandomUnderSampler(random_state=42)

X_train_rus, y_train_rus= rus.fit_resample(xtrain_tfidf, y_train)
clf_lg = clf_lg.fit(X_train_rus, y_train_rus)
lg_predictions = clf_lg.predict(xvalid_tfidf)

In [38]:
conf_matrix = confusion_matrix(y_test, lg_predictions)
tn = conf_matrix[0][0]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]
fp = conf_matrix[0][1]

accuracy = (tp + tn)/(tp + tn + fn + fp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2*(precision*recall)/(precision + recall)

print("undersampling")
print("Accuracy:",accuracy)
print("Precision:",precision)
print("recall:",recall)
print("f1_score:",f1_score)

undersampling
Accuracy: 0.9218106995884774
Precision: 0.8146067415730337
recall: 0.9982788296041308
f1_score: 0.897138437741686


# Reading real profile

In [39]:
import json
import os
import glob
import pandas as pd

In [40]:
import json
import os
import glob
import pandas as pd

json_dir = 'data/realprofile'
json_pattern = os.path.join(json_dir, '*.json')
file_list = glob.glob(json_pattern)

dfs = []
for file in file_list:
    with open(file) as f:
        json_data = pd.json_normalize(json.loads(f.read()))
        json_data['site'] = file.rsplit("/", 1)[-1]
    dfs.append(json_data)
realdf = pd.concat(dfs)

#adding a new column based on username column - to match with xxx.jpg when combining image captions
realdf["images"] = realdf["username"] + ".jpg"

#label
realdf["y"] = 1

# Reading scam profile

In [41]:
json_dir = 'data/scamprofile'
json_pattern = os.path.join(json_dir, '*.json')
file_list = glob.glob(json_pattern)

dfs = []
for file in file_list:
    with open(file) as f:
        json_data = pd.json_normalize(json.loads(f.read()))
        json_data['site'] = file.rsplit("/", 1)[-1]
    dfs.append(json_data)
scamdf = pd.concat(dfs)

#modifying the column "images" to join on filename - images = [xxx.jpg], change to xxx.jpg
scamdf["images"] = scamdf["images"].apply(lambda x: ''.join(x))

#label
scamdf["y"] = 0

# Reading image caption json (real)

In [66]:
json_dir = 'final caption model/real_labels'
json_pattern = os.path.join(json_dir, '*.json')
file_list = glob.glob(json_pattern)

fn_list = []
caption_list = []
for file in file_list:
    with open(file) as f:
        data = json.load(f)
        filename = list(data.keys())
        caption = list(data.values())
    fn_list.extend(filename)
    caption_list.extend(caption)

image_real_df = pd.DataFrame(data = {"filename":fn_list,"caption":caption_list})

#drop duplicates 
image_real_df = image_real_df.drop_duplicates(subset = "filename")

# Reading image caption json (scam)

In [68]:
json_dir = 'final caption model/scam_labels'
json_pattern = os.path.join(json_dir, '*.json')
file_list = glob.glob(json_pattern)

fn_list = []
caption_list = []
for file in file_list:
    with open(file) as f:
        data = json.load(f)
        filename = list(data.keys())
        caption = list(data.values())
    fn_list.extend(filename)
    caption_list.extend(caption)

image_scam_df = pd.DataFrame(data = {"filename":fn_list,"caption":caption_list})

In [95]:
##concat scam and real profiles
scam_and_real = pd.concat([scamdf[["username","images","y"]], realdf[["username","images","y"]]], ignore_index=True)
#scam_and_real.info()

In [96]:
#combine image captions + filename (real and scam) into one df
scam_image_and_real_image = pd.concat([image_real_df,image_scam_df],ignore_index = True)
scam_image_and_real_image.rename(columns = {"filename":"images"},inplace = True)

In [109]:
#left join scam image and real image captions to scam_and_real
final_df = scam_and_real.merge(scam_image_and_real_image,how = 'left', on = 'images')

In [114]:
# combine finaldf with combined df
final_df3 = combined_df.merge(final_df[["username","caption"]],how = "left",on = "username")
final_df3.dropna(subset = ["caption","username"],inplace = True)


In [117]:
final_df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4607 entries, 0 to 8504
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   username               4607 non-null   object 
 1   age                    4606 non-null   float64
 2   occupation             4604 non-null   object 
 3   status                 4607 non-null   object 
 4   gender                 4607 non-null   object 
 5   filtered sentence      4607 non-null   object 
 6   Language               4607 non-null   object 
 7   y                      4607 non-null   int64  
 8   translated_occupation  3192 non-null   object 
 9   caption                4607 non-null   object 
dtypes: float64(1), int64(1), object(8)
memory usage: 395.9+ KB


In [133]:
#use final df3 - create model with both description + caption 
# features
X = final_df3['caption']

# target
y = final_df3['y']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 424)

In [134]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

In [135]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=100000)
tfidf_vect.fit(X)
xtrain_tfidf =  tfidf_vect.transform(X_train)
xvalid_tfidf =  tfidf_vect.transform(X_test)

# Naive Bayes classifier for image captions

In [136]:
#random oversampling
text_clf = MultinomialNB()
ros = RandomOverSampler(random_state=42)

X_train_ros, y_train_ros= ros.fit_resample(xtrain_tfidf, y_train)
text_clf = text_clf.fit(X_train_ros, y_train_ros)
predictions = text_clf.predict(xvalid_tfidf)

In [137]:
#evaluating the naive bayes classifier - oversampling
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(y_test, predictions)
tn = conf_matrix[0][0]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]
fp = conf_matrix[0][1]

accuracy = (tp + tn)/(tp + tn + fn + fp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2*(precision*recall)/(precision + recall)

print("random oversampling")
print("Accuracy:",accuracy)
print("Precision:",precision)
print("recall:",recall)
print("f1_score:",f1_score)

random oversampling
Accuracy: 0.702819956616052
Precision: 0.4868421052631579
recall: 0.8473282442748091
f1_score: 0.6183844011142061


In [138]:
#random undersampling
text_clf = MultinomialNB()
rus = RandomUnderSampler(random_state=42)

X_train_rus, y_train_rus= rus.fit_resample(xtrain_tfidf, y_train)
text_clf = text_clf.fit(X_train_rus, y_train_rus)
predictions = text_clf.predict(xvalid_tfidf)

In [139]:
#evaluating the naive bayes classifier - undersampling
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(y_test, predictions)
tn = conf_matrix[0][0]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]
fp = conf_matrix[0][1]

accuracy = (tp + tn)/(tp + tn + fn + fp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2*(precision*recall)/(precision + recall)

print("random undersampling")
print("Accuracy:",accuracy)
print("Precision:",precision)
print("recall:",recall)
print("f1_score:",f1_score)

random undersampling
Accuracy: 0.7017353579175705
Precision: 0.485838779956427
recall: 0.851145038167939
f1_score: 0.6185852981969486


# svm classifier

In [140]:
from sklearn.linear_model import SGDClassifier

#random oversampling
text_clf_svm = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42)

ros = RandomOverSampler(random_state=42)

X_train_ros, y_train_ros= ros.fit_resample(xtrain_tfidf, y_train)
text_clf_svm = text_clf_svm.fit(X_train_ros, y_train_ros)
predictions = text_clf_svm.predict(xvalid_tfidf)

In [141]:
conf_matrix = confusion_matrix(y_test, predictions)
tn = conf_matrix[0][0]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]
fp = conf_matrix[0][1]

accuracy = (tp + tn)/(tp + tn + fn + fp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2*(precision*recall)/(precision + recall)

print("random oversampling")
print("Accuracy:",accuracy)
print("Precision:",precision)
print("recall:",recall)
print("f1_score:",f1_score)

random oversampling
Accuracy: 0.7114967462039046
Precision: 0.4957983193277311
recall: 0.9007633587786259
f1_score: 0.6395663956639566


In [142]:
#random undersampling
text_clf_svm = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42)

rus = RandomUnderSampler(random_state=42)

X_train_rus, y_train_rus= rus.fit_resample(xtrain_tfidf, y_train)
text_clf_svm = text_clf_svm.fit(X_train_rus, y_train_rus)
predictions = text_clf_svm.predict(xvalid_tfidf)

In [143]:
conf_matrix = confusion_matrix(y_test, predictions)
tn = conf_matrix[0][0]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]
fp = conf_matrix[0][1]

accuracy = (tp + tn)/(tp + tn + fn + fp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2*(precision*recall)/(precision + recall)

print("random undersampling")
print("Accuracy:",accuracy)
print("Precision:",precision)
print("recall:",recall)
print("f1_score:",f1_score)

random undersampling
Accuracy: 0.7158351409978309
Precision: 0.5
recall: 0.8931297709923665
f1_score: 0.6410958904109589


# logistic regression classifier

In [129]:
from sklearn.linear_model import LogisticRegression
#random oversampling
clf_lg = LogisticRegression()  

ros = RandomOverSampler(random_state=42)

X_train_ros, y_train_ros= ros.fit_resample(xtrain_tfidf, y_train)
clf_lg = clf_lg.fit(X_train_ros, y_train_ros)
lg_predictions = clf_lg.predict(xvalid_tfidf)

In [130]:
conf_matrix = confusion_matrix(y_test, lg_predictions)
tn = conf_matrix[0][0]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]
fp = conf_matrix[0][1]

accuracy = (tp + tn)/(tp + tn + fn + fp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2*(precision*recall)/(precision + recall)

print("random oversampling")
print("Accuracy:",accuracy)
print("Precision:",precision)
print("recall:",recall)
print("f1_score:",f1_score)

random oversampling
Accuracy: 0.7190889370932755
Precision: 0.5033407572383074
recall: 0.8625954198473282
f1_score: 0.6357243319268637


In [131]:
#random undersampling
clf_lg = LogisticRegression()  

rus = RandomUnderSampler(random_state=42)

X_train_rus, y_train_rus= rus.fit_resample(xtrain_tfidf, y_train)
clf_lg = clf_lg.fit(X_train_rus, y_train_rus)
lg_predictions = clf_lg.predict(xvalid_tfidf)

In [132]:
conf_matrix = confusion_matrix(y_test, lg_predictions)
tn = conf_matrix[0][0]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]
fp = conf_matrix[0][1]

accuracy = (tp + tn)/(tp + tn + fn + fp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2*(precision*recall)/(precision + recall)

print("random undersampling")
print("Accuracy:",accuracy)
print("Precision:",precision)
print("recall:",recall)
print("f1_score:",f1_score)

random undersampling
Accuracy: 0.7277657266811279
Precision: 0.512249443207127
recall: 0.8778625954198473
f1_score: 0.6469760900140648
