# Bag of words baseline model

In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression 
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer


In [4]:
post_type = 'post_travel'
data_path = '../data/split/train.csv'
test_data_path = '../data/split/test.csv'
narcism_type = 'adm'

In [5]:
# Read the data
data = pd.read_csv(data_path)
test_data = pd.read_csv(test_data_path)
data.head()

Unnamed: 0,post_travel,post_abortion,adm,riv,gender,gender_3_text,age,ethnic_background,ethnic_background_8_text,education,...,marital_status,twitter,none,facebook,instagram,tiktok,linkedin,pinterest,other,other_portals_7_text
0,I wish I could travel 24/7 and get paid for it,"This is a horrible time to be alive, when wome...",1.444,1.111,2.0,,33,1.0,,2.0,...,4.0,6.0,,1.0,1.0,1.0,1.0,,,
1,Vacations are pricey these days but so worth i...,Safe sex will always be the best option. The g...,3.889,1.111,1.0,,27,8.0,Black African,2.0,...,5.0,1.0,,1.0,1.0,1.0,1.0,,,
2,I recently visited beautiful Stratford upon Av...,I am very strongly apposed against the abortio...,3.444,2.667,2.0,,41,1.0,,1.0,...,5.0,1.0,,1.0,1.0,1.0,1.0,1.0,,
3,I have just visited Marrakesh.The scenery is l...,Abortion is an emotive subject but a total ban...,3.667,2.889,1.0,,65,1.0,,5.0,...,1.0,2.0,1.0,,,,,,,
4,"I travel a lot for work, and I get to see all ...",This is fucking stupid and scary. Restricting ...,1.222,1.222,1.0,,30,1.0,,4.0,...,5.0,2.0,,1.0,,,,,1.0,reddit


In [32]:
# Create a pipeline that includes vectorization and transformation
preprocessing_pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer())
])

tfidf = preprocessing_pipeline.fit_transform(data[post_type].values)
df_counts = pd.DataFrame(tfidf.toarray())
df_counts['narcissism'] = data[narcism_type]

df_counts.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,891,892,893,894,895,896,897,898,899,narcissism
0,0.0,0.0,0.0,0.0,0.0,0.475189,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.444
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.120431,0.0,0.0,0.0,0.0,3.889
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.444
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.18467,0.0,0.0,0.0,0.0,3.667
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.222


In [34]:
# Transform the test data
tfidf = preprocessing_pipeline.transform(test_data[post_type].values)
test_counts = pd.DataFrame(tfidf.toarray())
test_counts['narcissism'] = test_data[narcism_type]

test_counts.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,891,892,893,894,895,896,897,898,899,narcissism
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.667
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.111
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.222
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.889


## Compare different models

In [35]:
# Load split data
X_train = df_counts.iloc[:, :-1]
y_train = np.ravel(df_counts[['narcissism']])
X_test = test_counts.iloc[:, :-1]
y_test = np.ravel(test_counts[['narcissism']])

In [36]:
def train_model(model, X_train, y_train):
    model.fit(X_train, y_train)
    return model

def predict(model, X_test):
    return model.predict(X_test)

def evaluate(y_test, y_pred):
    return mean_squared_error(y_test, y_pred)

In [42]:
lr_model = LinearRegression()
train_model(lr_model, X_train, y_train)
y_pred = predict(lr_model, X_test)
mse = evaluate(y_test, y_pred)
print(f"Linear Regression MSE: {mse}")

Linear Regression MSE: 0.7768377376883937


In [44]:
mlpr_model = MLPRegressor(max_iter=800)
train_model(mlpr_model, X_train, y_train)
y_pred = predict(mlpr_model, X_test)
mse = evaluate(y_test, y_pred)
print(f"MLP Regressor MSE: {mse}")

MLP Regressor MSE: 0.9559342377939227


In [45]:
svr_model = SVR()
train_model(svr_model, X_train, y_train)
y_pred = predict(svr_model, X_test)
mse = evaluate(y_test, y_pred)
print(f"SVR MSE: {mse}")

SVR MSE: 0.6405744977662071


In [46]:
rfr_model = RandomForestRegressor()
train_model(rfr_model, X_train, y_train)
y_pred = predict(rfr_model, X_test)
mse = evaluate(y_test, y_pred)
print(f"Random Forest MSE: {mse}")

Random Forest MSE: 0.6826325569326087


In [47]:
dtr_model = DecisionTreeRegressor()
train_model(dtr_model, X_train, y_train)
y_pred = predict(dtr_model, X_test)
mse = evaluate(y_test, y_pred)
print(f"Decision Tree MSE: {mse}")

Decision Tree MSE: 1.3985342391304345


So far the best effects give SVM and Random Forest, but also the simples solution has rather small MSE - LinearRegression

## Testing on some new data
Generated by chat GPT 3.5 
Prompt: *"Write me a travel post for twitter"*

In [58]:
# Choosing the best model based on the MSE
best_model = svr_model

In [60]:
# Won't work with the current data

new_data = ["Embarking on an exhilarating adventure through the enchanting streets of Kyoto, Japan. 🎌 From the serene bamboo forests of Arashiyama to the historic temples of Kinkaku-ji and Fushimi Inari Taisha, every corner unveils a tale of tradition and tranquility. #Kyoto #TravelJapan", "Lost in the colorful labyrinth of Marrakech's bustling souks, where the scent of spices fills the air and vibrant textiles dance in the breeze. 🕌✨ Exploring hidden riads, savoring tagine delights, and getting lost in the magic of Jardin Majorelle. #Marrakech #TravelGoals 🌴🌞"]
preprocessed_data = preprocessing_pipeline.transform(new_data)
post_test = pd.DataFrame(preprocessed_data.toarray())
predicted = predict(best_model, post_test)
for doc, category in zip(new_data, predicted):
     print(f'{doc} =>\nnarcism: {category}')

Embarking on an exhilarating adventure through the enchanting streets of Kyoto, Japan. 🎌 From the serene bamboo forests of Arashiyama to the historic temples of Kinkaku-ji and Fushimi Inari Taisha, every corner unveils a tale of tradition and tranquility. #Kyoto #TravelJapan =>
narcism: 2.696610216764748
Lost in the colorful labyrinth of Marrakech's bustling souks, where the scent of spices fills the air and vibrant textiles dance in the breeze. 🕌✨ Exploring hidden riads, savoring tagine delights, and getting lost in the magic of Jardin Majorelle. #Marrakech #TravelGoals 🌴🌞 =>
narcism: 2.79216584649148


Testing it on data that is not talking about travel (the abortion posts)

In [62]:
tfidf = preprocessing_pipeline.transform(data["post_abortion"].values)

test_counts_ab = pd.DataFrame(tfidf.toarray())
test_counts_ab['narcissism'] = data[narcism_type]
X_test_ab = test_counts_ab.iloc[:, :-1]
y_test_ab = np.ravel(test_counts_ab[['narcissism']])
y_pred_ab = predict(best_model, X_test_ab)
mse = evaluate(y_test_ab, y_pred_ab)
print(f"SVR MSE test_ab: {mse}")

SVR MSE test_ab: 0.8148133279056374


It is interesting that the model rather correctly predicts the 'ADM' narcissism in abortion posts despite being trained on different data. MSE is still less then 1 point (on the scale 1 to 6)