## Text classification

In [1]:
import pandas as pd
import numpy as np

In [7]:
# Load the combined profile table from a CSV file given by relative path.
combined_df = pd.read_csv(filepath_or_buffer="combined_profile.csv")

In [8]:
# Preview the first five rows to check the columns that were loaded.
combined_df.head(n=5)

Unnamed: 0,username,age,l_occupation,status,gender,filtered sentence,y
0,williamgeorge,44,doctoral student,single,male,Tired Millennial outlook I see around Is woman...,0
1,lion10x,59,legal services,divorced,male,I good man healthy honest high educated always...,0
2,Luistroy,42,construction,single,male,Im 56 weigh 140 black hair brown eyes slim loo...,0
3,Jonjon46,49,landscaper/cook an baker jack of all trades,single,male,Im laid back easy going guy funny love cars mu...,0
4,vitor78,43,businessman,divorced,male,I simple person really loves live I nature lov...,0


In [11]:
# features: select the text column as a 1-D Series (single brackets).
# The pipelines below pass X_train straight to CountVectorizer, which iterates
# over its input and treats each item as one document. Iterating a DataFrame
# yields its column labels, not its rows, so a one-column DataFrame
# (double brackets) would be seen as a single document and fitting would fail
# with a sample-count mismatch against y.
X = combined_df['filtered sentence']

# target
y = combined_df['y']

In [12]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=424,
)

In [None]:
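# Optional: label-encode the target variable (currently disabled).
# If re-enabled, fit the encoder on y_train only and reuse it for y_test so
# that both splits share the same label-to-integer mapping.
#encoder = preprocessing.LabelEncoder()
#y_train = encoder.fit_transform(y_train)
#y_test = encoder.transform(y_test)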

# Naive Bayes classifier
- Taken from: https://towardsdatascience.com/machine-learning-nlp-text-classification-using-scikit-learn-python-and-nltk-c52b92a7c73a

In [13]:
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

from sklearn.pipeline import Pipeline
# Naive Bayes text pipeline: token counts -> TF-IDF weighting -> MultinomialNB.
nb_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
# fit() returns the fitted pipeline itself, so text_clf is the trained model.
text_clf = Pipeline(steps=nb_steps).fit(X_train, y_train)

In [19]:
# Predict labels for the held-out test documents with the Naive Bayes pipeline.
predictions = text_clf.predict(X=X_test)

### Evaluating the NB classifier

In [22]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of true labels (rows) against predicted labels (columns).
conf_matrix = confusion_matrix(y_test, predictions)
tn, fp = conf_matrix[0, 0], conf_matrix[0, 1]
fn, tp = conf_matrix[1, 0], conf_matrix[1, 1]

# Hand-computed metrics, treating the class at index 1 as the positive class.
total = tp + tn + fn + fp
accuracy = (tp + tn) / total
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2 * (precision * recall) / (precision + recall)

for caption, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("recall:", recall),
    ("f1_score:", f1_score),
):
    print(caption, value)

Accuracy: 0.8718428437792329
Precision: 0.7211238293444329
recall: 0.9914163090128756
f1_score: 0.8349397590361446


# SVM classifier

In [24]:
from sklearn.linear_model import SGDClassifier
# SVM text pipeline: token counts -> TF-IDF weighting -> SGDClassifier with
# hinge loss and an L2 penalty. random_state fixes the SGD shuffling seed.
svm_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',
                              alpha=1e-3, random_state=42)),
]

# fit() returns the fitted pipeline itself, so text_clf_svm is the trained model.
text_clf_svm = Pipeline(steps=svm_steps).fit(X_train, y_train)

In [25]:
# Predict labels for the held-out test documents with the SVM pipeline.
# Note: this rebinds `predictions`, replacing the Naive Bayes predictions.
predictions = text_clf_svm.predict(X=X_test)

### Evaluating the SVM classifier

In [26]:
# Confusion matrix of true labels (rows) against predicted labels (columns).
conf_matrix = confusion_matrix(y_test, predictions)
tn, fp = conf_matrix[0, 0], conf_matrix[0, 1]
fn, tp = conf_matrix[1, 0], conf_matrix[1, 1]

# Hand-computed metrics, treating the class at index 1 as the positive class.
total = tp + tn + fn + fp
accuracy = (tp + tn) / total
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2 * (precision * recall) / (precision + recall)

for caption, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("recall:", recall),
    ("f1_score:", f1_score),
):
    print(caption, value)

Accuracy: 0.8699719363891487
Precision: 0.7158974358974359
recall: 0.9985693848354793
f1_score: 0.8339307048984469


# Logistic regression

In [29]:
from sklearn.linear_model import LogisticRegression
# Train model: token counts -> TF-IDF weighting -> logistic regression.
# clf_lg is bound once, directly to the pipeline; a bare LogisticRegression()
# assigned to the same name beforehand would be overwritten before any use.
clf_lg = Pipeline([('vect', CountVectorizer()),
                   ('tfidf', TfidfTransformer()),
                   ('clf', LogisticRegression()),
])
# fit() returns the fitted pipeline itself, so clf_lg is the trained model.
clf_lg = clf_lg.fit(X_train, y_train)

In [30]:
# Predict labels for the held-out test documents with the logistic regression pipeline.
lg_predictions = clf_lg.predict(X=X_test)

### Evaluating logistic regression classifier

In [31]:
# Confusion matrix of true labels (rows) against predicted labels (columns).
conf_matrix = confusion_matrix(y_test, lg_predictions)
tn, fp = conf_matrix[0, 0], conf_matrix[0, 1]
fn, tp = conf_matrix[1, 0], conf_matrix[1, 1]

# Hand-computed metrics, treating the class at index 1 as the positive class.
total = tp + tn + fn + fp
accuracy = (tp + tn) / total
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2 * (precision * recall) / (precision + recall)

for caption, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("recall:", recall),
    ("f1_score:", f1_score),
):
    print(caption, value)

Accuracy: 0.9106641721234799
Precision: 0.7995283018867925
recall: 0.9699570815450643
f1_score: 0.8765352294764058
