In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt

In [None]:
# Load the raw posts, discard the identifier/link columns, and fix the dtypes
# of the label and text columns before any text processing happens.
df = (
    pd.read_csv('/Users/grace/Documents/SSTP/all_loneliness_posts.csv', encoding='utf-8')
    .drop(columns=['id', 'full_link'])
    .astype({'feelLonely': int, 'text': str})
)
df

In [None]:
# English stop words, held as a set: preprocess_data tests every token of every
# post for membership, and a set lookup is constant-time whereas the list
# returned by NLTK is scanned linearly. WordCloud accepts any iterable here.
stop_words = set(stopwords.words('english'))
# Snowball stemmer instance. No cell visible in this notebook uses it
# (the preprocessing lemmatizes instead of stemming).
stemmer = nltk.SnowballStemmer("english")

In [None]:
from nltk.stem import WordNetLemmatizer
# Shared lemmatizer instance; preprocess_data calls lemmatizer.lemmatize on each token.
lemmatizer = WordNetLemmatizer()

In [None]:
# Fetch the NLTK resources named below (WordNet and the Open Multilingual
# WordNet 1.4 package). These calls need network access on first run.
nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:
def remove_emojis(text):
    """Return ``text`` with every character in the listed Unicode ranges deleted.

    Note that the final range runs from U+24C2 all the way to U+1F251, so it
    covers far more than enclosed characters (for example CJK ideographs are
    inside it and are removed as well).
    """
    unicode_ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
        u"\U00002702-\U000027B0",  # other miscellaneous symbols
        u"\U000024C2-\U0001F251",  # enclosed characters
    )
    matcher = re.compile("[" + "".join(unicode_ranges) + "]+", flags=re.UNICODE)
    return matcher.sub(r'', text)

def remove_hashtags(text):
    """Delete hashtags from ``text``.

    Two forms are matched: a ``#word`` that does not directly follow a word
    character, and a ``#word`` (plus trailing whitespace) at the very end of
    the string.
    """
    hashtag_re = re.compile(r'(\B#\w+\b)|(\#\w+\b\s*$)')
    return hashtag_re.sub('', text)
def remove_specialchars(text):
    """Blank out stand-alone ``&`` and ``$`` tokens.

    The text is split on single spaces; a token that is exactly ``&`` or ``$``
    becomes an empty string (its surrounding spaces stay), and every other
    token is kept unchanged.
    """
    tokens = ['' if token in ('&', '$') else token for token in text.split(' ')]
    return ' '.join(tokens)
def remove_extraspace(text):
    """Collapse every run of two or more whitespace characters into one space.

    The run is replaced by a single space rather than deleted, so the words on
    either side of it stay separate tokens. A lone whitespace character is
    left as it is.
    """
    # Raw string so that \s reaches the regex engine as written.
    return re.sub(r'\s\s+', ' ', text)
def clean_text(text):
    """Strip links, @mentions, CRLF line breaks and punctuation from ``text``.

    Links and mentions are removed before punctuation: their patterns rely on
    '.', '/', ':' and '@', which the punctuation pass deletes, so they can only
    match while those characters are still present.

    Whitespace left behind by removed tokens is not collapsed here.
    """
    # Remove links. The last alternative treats any whitespace-free token with
    # an inner '.' as a link, so tokens such as "end.Start" are dropped too.
    text = re.sub(r'http\S+|www\S+|\S+\.\S+', '', text)
    # Remove mentions
    text = re.sub(r'@\w+', '', text)
    # Replace \r\n new line characters with a space so the lines stay separated
    text = text.replace('\r\n', ' ')
    # Remove punctuation (anything that is neither a word character nor whitespace)
    text = re.sub(r'[^\w\s]', '', text)
    return text

In [None]:
def preprocess_data(text):
    """Normalise one post into a lower-case, space-separated string of lemmas.

    The text is lower-cased, passed through the cleaning helpers in a fixed
    order, split on whitespace, filtered against ``stop_words``, and each
    remaining token is lemmatized with the shared ``lemmatizer``.
    """
    cleaned = text.lower()
    cleaning_steps = (
        remove_emojis,
        remove_hashtags,
        remove_specialchars,
        remove_extraspace,
        clean_text,
    )
    for step in cleaning_steps:
        cleaned = step(cleaned)
    kept_tokens = [token for token in cleaned.split() if token not in stop_words]
    lemmas = [lemmatizer.lemmatize(token) for token in kept_tokens]
    return ' '.join(lemmas).lower()

In [None]:
# Run preprocess_data on every post's text and store the result in a new
# 'cleaned' column next to the original 'text' column.
df['cleaned'] = df['text'].apply(preprocess_data)
df

In [None]:
# Drop posts whose cleaned text contains 'removed', since those were deleted
# from Reddit. This is a plain substring match, so a genuine post that merely
# mentions the word "removed" is dropped as well.
# Missing values count as removed (na=True), so such rows are dropped too.
is_removed = df['cleaned'].str.contains('removed', regex=False, na=True)
# .copy() makes the filtered frame independent of the original, so the later
# writes into its 'cleaned' column do not act on a slice (SettingWithCopyWarning).
df = df[~is_removed].copy()
df

In [None]:
pip install wordninja

In [None]:
import wordninja

In [None]:
# Re-segment every cleaned post with wordninja so that words which ended up
# glued together are separated again, then re-join the pieces with spaces.
df['cleaned'] = df['cleaned'].map(lambda post: ' '.join(wordninja.split(post)))
df

In [None]:
# Non-null count per column for the rows labelled feelLonely == 0.
df.loc[df['feelLonely'].eq(0)].count()

In [None]:
# Non-null count per column for the rows labelled feelLonely == 1.
df.loc[df['feelLonely'].eq(1)].count()

In [None]:
import wordcloud
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [None]:
#word cloud for not lonely posts
# Build the corpus by joining the cleaned posts directly. Series.to_string()
# renders the printed form of the Series, which mixes index labels and
# alignment padding into the text handed to the word cloud.
not_lonely_text = ' '.join(df.loc[df.feelLonely == 0, 'cleaned'])
wc1 = WordCloud(background_color="white", max_words=50, stopwords=stop_words,
                   max_font_size=40)
wc1.generate(not_lonely_text)
plt.imshow(wc1)
plt.title('not lonely')
plt.axis("off")
plt.show()

In [None]:
#word cloud for lonely posts
# Build the corpus by joining the cleaned posts directly. Series.to_string()
# renders the printed form of the Series, which mixes index labels and
# alignment padding into the text handed to the word cloud.
lonely_text = ' '.join(df.loc[df.feelLonely == 1, 'cleaned'])
wc2 = WordCloud(background_color="white", max_words=50, stopwords=stop_words,
                   max_font_size=40)
wc2.generate(lonely_text)
plt.imshow(wc2)
plt.title('lonely')
plt.axis("off")
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Model input is the cleaned text; the target is the feelLonely label.
X = df['cleaned']      # features
y = df['feelLonely']   # label
for part in (X, y):
    print(part.shape)

In [None]:
#split data into training and test (80/20), stratified on the label so both
#parts keep the same class proportions.
# A fixed random_state makes the split, and therefore every metric reported
# below, reproducible across kernel restarts.
X_train,X_test,y_train,y_test=train_test_split(
    X,y,test_size=0.2,stratify=y,random_state=42)

In [None]:
# Shapes of the four pieces produced by the train/test split, in the order
# X_train, X_test, y_train, y_test.
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Stand-alone bag-of-words vectoriser with default settings, used to inspect
# the matrix shapes; the modelling cells further down build their own
# CountVectorizer inside a Pipeline.
cv = CountVectorizer()

# The vocabulary is fitted on the training texts only; the test texts are
# transformed with that same fitted vocabulary.
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

In [None]:
# Shape of the training count matrix produced by cv.fit_transform.
X_train_cv.shape

In [None]:
# Shape of the test count matrix produced by cv.transform.
X_test_cv.shape

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
# Fit a TF-IDF transformer on the training count matrix and transform it.
# X_train_tfidf is not referenced by any later cell visible in this notebook.
tfidf = TfidfTransformer()
X_train_tfidf = tfidf.fit_transform(X_train_cv)

In [None]:
import xgboost as xgb
from xgboost import XGBClassifier

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score ,accuracy_score, precision_score,recall_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.model_selection import learning_curve, validation_curve

In [None]:
#testing hyperparameters

# Candidates are compared on a validation split carved out of the training
# data. Scoring them on X_test would use the test set for model selection and
# make the final test metrics optimistic.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42)

learning_rates = [0.08, 0.1, 0.16]
max_depths = [6, 8, 10]
num_estimators = [100,150,200,250]

# Exhaustive sweep over every (learning_rate, max_depth, n_estimators) combination.
for learning_rate in learning_rates:
    for max_depth in max_depths:
        for n_estimators in num_estimators:
            print(learning_rate,max_depth,n_estimators)

            # A fresh pipeline per candidate so nothing fitted is carried over.
            pipe = Pipeline([
                        ('bow', CountVectorizer()),
                        ('tfidf', TfidfTransformer()),
                        ('model', xgb.XGBClassifier(
                        objective='binary:logistic',
                        learning_rate=learning_rate,
                        max_depth=max_depth,
                        n_estimators=n_estimators,
                        eval_metric='logloss'))
                 ])

            # Fit on the fitting part of the training data only
            pipe.fit(X_fit, y_fit)

            # Report precision/recall/F1 on the validation part
            y_val_pred = pipe.predict(X_val)
            report = classification_report(y_val, y_val_pred)
            print(report)

In [None]:
# best params found above: learning rate-0.1, max tree depth-6, num trees-200
# Build the XGBoost classifier with those values, wrap it in the same
# bag-of-words -> TF-IDF pipeline, fit on the training split and score on the
# test split.
best_params = dict(
    objective='binary:logistic',
    learning_rate=0.1,
    max_depth=6,
    n_estimators=200,
    use_label_encoder=False,
    eval_metric='logloss',
)
xgb_model = xgb.XGBClassifier(**best_params)

pipeline_steps = [
    ('bow', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', xgb_model),
]
pipe = Pipeline(pipeline_steps)

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

# Per-class report first, then the F1 score on its own line.
report = classification_report(y_test, y_pred)
print(report)
print(f1_score(y_test, y_pred))

In [None]:
# Show the fitted pipeline as a diagram: set_config(display='diagram') switches
# scikit-learn's estimator display mode, and evaluating `pipe` as the last
# expression of the cell renders it.
from sklearn import set_config
set_config(display='diagram')
pipe

In [None]:
#confusion matrix of the test-set predictions
cm = confusion_matrix(y_test,y_pred)
# Named cm_display rather than `display`, so IPython's display() function is
# not shadowed in the notebook namespace.
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0,1])
cm_display.plot()
plt.title("confusion matrix")
plt.show()

In [None]:
# Overall metrics for the test-set predictions, each rounded to two decimals.
overall_metrics = (
    ('Accuracy Score:', accuracy_score),
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1 score:', f1_score),
)
for label, metric in overall_metrics:
    print(label, round(metric(y_test, y_pred), 2))