In [1]:
import pandas as pd
from sklearn.feature_extraction import text

df = pd.read_csv("./labelled_posts.csv")

# Drop reposts (i.e. engagement & comments = 0)
no_reposts_df = df[(df['engagement'] != 0) | (df['comments'] != 0)].copy()

# Lowercase all words
no_reposts_df.loc[:, "content"] = no_reposts_df["content"].apply(lambda x : str.lower(x))


texts, labels = no_reposts_df["content"], no_reposts_df["personal_exp"]


In [2]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=21)


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TfidfVectorizer: Convert a collection of raw documents to a matrix of TF-IDF features
# TfidfVectorizer weights the word counts by a measure of how often they appear in the documents
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


# Logistic Regression

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize and train the Logistic Regression model
logistic_regression = LogisticRegression()
logistic_regression.fit(X_train_tfidf, y_train)

# Make predictions on the test data
y_pred = logistic_regression.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Print the results
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", classification_rep)

Accuracy: 0.7012987012987013
Confusion Matrix:
 [[54  0]
 [23  0]]
Classification Report:
               precision    recall  f1-score   support

       False       0.70      1.00      0.82        54
        True       0.00      0.00      0.00        23

    accuracy                           0.70        77
   macro avg       0.35      0.50      0.41        77
weighted avg       0.49      0.70      0.58        77



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Naive Bayes

In [5]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize and train the Multinomial Naive Bayes classifier
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train_tfidf, y_train)

# Make predictions on the test data
y_pred = naive_bayes.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Print the results
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", classification_rep)

Accuracy: 0.7012987012987013
Confusion Matrix:
 [[54  0]
 [23  0]]
Classification Report:
               precision    recall  f1-score   support

       False       0.70      1.00      0.82        54
        True       0.00      0.00      0.00        23

    accuracy                           0.70        77
   macro avg       0.35      0.50      0.41        77
weighted avg       0.49      0.70      0.58        77



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [6]:
# https://www.oreilly.com/library/view/natural-language-processing/9781491978221/ch04.html