#### ellali

In [1]:
import pandas as pd
from tqdm import tqdm

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
# read data

In [4]:
train_label = pd.read_csv('WN23_data/WN23_PA_training_labels.txt', sep=',')

In [5]:
train_label

Unnamed: 0,TweetID,Label
0,1001,0
1,1002,1
2,1003,0
3,1004,0
4,1005,0
...,...,...
3995,4996,1
3996,4997,1
3997,4998,1
3998,4999,0


In [6]:
train_tweets = pd.read_csv('WN23_data/WN23_PA_training_tweets.txt', sep=',', encoding = 'ISO-8859-1')
train_tweets

Unnamed: 0,TweetID,TimeOfDay,Tweet
0,1001,17,The Bulldogs have been selected to finish 4th ...
1,1002,22,Played disc golf. Got a tattoo. Heading to Det...
2,1003,16,Sunday big football game I'm gunna gather all ...
3,1004,20,Despite my resolution to be nicer to Scooter t...
4,1005,18,Reassigned by Michigan Runner to shoot Goodlif...
...,...,...,...
3995,4996,23,completed his food and exercise diary for 5/01...
3996,4997,22,It's a great feeling when all of your pants ar...
3997,4998,22,Pretty freaking excited for fishing season rig...
3998,4999,23,"Ok, now that's just funny. Õ¬Õ__ Social coach..."


In [7]:
# join together

In [8]:
train_data = pd.merge(train_tweets, train_label, on="TweetID", how="left")
train_data

Unnamed: 0,TweetID,TimeOfDay,Tweet,Label
0,1001,17,The Bulldogs have been selected to finish 4th ...,0
1,1002,22,Played disc golf. Got a tattoo. Heading to Det...,1
2,1003,16,Sunday big football game I'm gunna gather all ...,0
3,1004,20,Despite my resolution to be nicer to Scooter t...,0
4,1005,18,Reassigned by Michigan Runner to shoot Goodlif...,0
...,...,...,...,...
3995,4996,23,completed his food and exercise diary for 5/01...,1
3996,4997,22,It's a great feeling when all of your pants ar...,1
3997,4998,22,Pretty freaking excited for fishing season rig...,1
3998,4999,23,"Ok, now that's just funny. Õ¬Õ__ Social coach...",0


In [9]:
train_data = train_data.drop("TimeOfDay", axis=1)
train_data

Unnamed: 0,TweetID,Tweet,Label
0,1001,The Bulldogs have been selected to finish 4th ...,0
1,1002,Played disc golf. Got a tattoo. Heading to Det...,1
2,1003,Sunday big football game I'm gunna gather all ...,0
3,1004,Despite my resolution to be nicer to Scooter t...,0
4,1005,Reassigned by Michigan Runner to shoot Goodlif...,0
...,...,...,...
3995,4996,completed his food and exercise diary for 5/01...,1
3996,4997,It's a great feeling when all of your pants ar...,1
3997,4998,Pretty freaking excited for fishing season rig...,1
3998,4999,"Ok, now that's just funny. Õ¬Õ__ Social coach...",0


In [10]:
# split 8/2

from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(train_data['Tweet'], train_data['Label'], test_size=0.2, random_state=42)

In [11]:
X_train

3994    "@FamousNegro: When the whole squad turnt __\n...
423     First you get a swimming pool full of Starbuck...
2991    I'm 25 what y'all want me to keep running arou...
1221    #NowPlaying "Midnight on the Run" by BoomBox f...
506     Excited for my first day as the KHMS dance coa...
                              ...                        
1130    Of course one of my fav movies and Sarah &amp;...
1294    @stjude  Getting prepped for the Advokate 1 mi...
860                            I need to go back to dance
3507    I had 3 volleyball games yesterday all in a ro...
3174    We missed Saturday, time to make up some for f...
Name: Tweet, Length: 3200, dtype: object

In [12]:
# clean

import nltk
from nltk import word_tokenize    
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re


stop_words = set(stopwords.words('english'))

In [13]:
def clean_data(data):
    
    data = re.sub("@[A-Za-z0-9_]+","", data)
    data = re.sub(r"(?:\@|https?\://)\S+", '', data)
    data = re.sub(r'[^\w\s]', '', data)
    data = word_tokenize(data.lower())
    words = [word for word in data if word.isalpha() and word not in stop_words] # Remove stop words and non-alphabetic characters
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]
    wordnet_lemmatizer = WordNetLemmatizer()
    words = [wordnet_lemmatizer.lemmatize(word) for word in words]
    
    cleaned_sentence = " ".join(words)
    
    return cleaned_sentence

In [14]:
X_train = X_train.apply(clean_data)
X_train

3994              whole squad turnt walk brandon basement
423             first get swim pool full starbuck diiiiiv
2991          im yall want keep run around like im someth
1221      nowplay midnight run boombox vision backbeat äª
506     excit first day khm danc coach gon na pretti cute
                              ...                        
1130    cours one fav movi sarah amp mike first danc h...
1294    get prep advok run week good luck michigan run...
860                                     need go back danc
3507    volleybal game yesterday row im sore cant even...
3174    miss saturday time make fit fun zumba bokwa di...
Name: Tweet, Length: 3200, dtype: object

In [15]:
X_valid = X_valid.apply(clean_data)
X_valid

555         bit as tri walk ici drive way go along winter
3491       rain rain stay away want abl see kickbal today
527     yoga south lawn detroit institut art detroit m...
3925    packard plant polish origin archiv digitaltrav...
2989                 complet mi run runkeep check runkeep
                              ...                        
1922    marriott surf club part timeshar resort realli...
865     think know kpop minwookevilvalenstrif super ju...
3943                                 mention cook egg lol
1642    ron howard alway comfort voic carri episod arr...
2483    two week real nfl season start cant wait guess...
Name: Tweet, Length: 800, dtype: object

In [16]:
# feature extractor

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
tfidf_vec = TfidfVectorizer(stop_words='english')
X_train = tfidf_vec.fit_transform(X_train)
X_valid = tfidf_vec.transform(X_valid)

In [19]:
print('shape of x_train features:', X_train.shape)
print('shape of x_validation features:', X_valid.shape)
print('word-index mapping dictionary:', len(tfidf_vec.vocabulary_))

shape of x_train features: (3200, 5998)
shape of x_validation features: (800, 5998)
word-index mapping dictionary: 5998


In [20]:
# model - classifier

from sklearn.metrics import classification_report
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

In [44]:
###### svm

model_svm = svm.SVC(C=1, kernel='linear') 
model_svm.fit(X_train, y_train)
y_pred = model_svm.predict(X_valid)

report = classification_report(y_valid, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.75      0.87      0.80       521
           1       0.65      0.45      0.53       279

    accuracy                           0.72       800
   macro avg       0.70      0.66      0.67       800
weighted avg       0.71      0.72      0.71       800



In [22]:
from sklearn.naive_bayes import MultinomialNB

model_nb = MultinomialNB()

model_nb.fit(X_train, y_train)

y_pred = model_nb.predict(X_valid)

report = classification_report(y_valid, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.67      0.98      0.80       521
           1       0.73      0.11      0.20       279

    accuracy                           0.68       800
   macro avg       0.70      0.55      0.50       800
weighted avg       0.69      0.68      0.59       800



In [23]:
####### svm hyper

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}

model_svm = SVC()

grid_search = GridSearchCV(model_svm, param_grid, cv=5)

grid_search.fit(X_train, y_train)

print("Best hyperparameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

Best hyperparameters:  {'C': 1, 'kernel': 'linear'}
Best score:  0.76


In [24]:
## logi

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

model_lr = LogisticRegression()

model_lr.fit(X_train, y_train)

y_pred = model_lr.predict(X_valid)

report = classification_report(y_valid, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.72      0.94      0.81       521
           1       0.72      0.31      0.44       279

    accuracy                           0.72       800
   macro avg       0.72      0.62      0.62       800
weighted avg       0.72      0.72      0.68       800



In [25]:

# param_grid = {
#     'C': [0.1, 1, 10],
#     'penalty': ['l1', 'l2'],
#     'solver': ['liblinear', 'saga'],
#     'max_iter': [100, 200, 500]
# }


# model_lr = LogisticRegression()

# grid_search = GridSearchCV(model_lr, param_grid, cv=5)

# grid_search.fit(X_train, y_train)

# print("Best hyperparameters: ", grid_search.best_params_)
# print("Best score: ", grid_search.best_score_)

# best_model = grid_search.best_estimator_
# y_pred = best_model.predict(X_valid)

# report = classification_report(y_valid, y_pred)
# print(report)

In [26]:
# # svc

# from sklearn import svm
# # Define the parameter grid to search over
# param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}

# # Create an SVC classifier
# model_svc = svm.SVC()

# # Create a grid search object with 5-fold cross-validation
# grid_search = GridSearchCV(model_svc, param_grid, cv=5)

# # Fit the grid search object to the training data
# grid_search.fit(X_train, y_train)

# # Print the best hyperparameters and the best score
# print("Best hyperparameters: ", grid_search.best_params_)
# print("Best score: ", grid_search.best_score_)

# # Make predictions on the validation set using the best model
# best_model = grid_search.best_estimator_
# y_pred = best_model.predict(X_valid)

# # Compute the classification report for the best model
# report = classification_report(y_valid, y_pred)
# print(report)

In [27]:
# # mlp

# from sklearn.neural_network import MLPClassifier

# # Define the parameter grid to search over
# param_grid = {
#     'hidden_layer_sizes': [(10,), (50,), (100,)],
#     'activation': ['relu', 'tanh', 'logistic'],
#     'alpha': [0.1, 1, 10]
# }

# # Create an MLP classifier
# model_mlp = MLPClassifier()

# # Create a grid search object with 5-fold cross-validation
# grid_search = GridSearchCV(model_mlp, param_grid, cv=5)

# # Fit the grid search object to the training data
# grid_search.fit(X_train, y_train)

# # Print the best hyperparameters and the best score
# print("Best hyperparameters: ", grid_search.best_params_)
# print("Best score: ", grid_search.best_score_)

# # Make predictions on the validation set using the best model
# best_model = grid_search.best_estimator_
# y_pred = best_model.predict(X_valid)

# # Compute the classification report for the best model
# report = classification_report(y_valid, y_pred)
# print(report)

In [28]:
# # dt

# from sklearn.tree import DecisionTreeClassifier

# # Define the parameter grid to search over
# param_grid = {'max_depth': [5, 10, 20, None], 'min_samples_split': [2, 5, 10]}

# # Create a Decision Tree classifier
# model_dt = DecisionTreeClassifier()

# # Create a grid search object with 5-fold cross-validation
# grid_search = GridSearchCV(model_dt, param_grid, cv=5)

# # Fit the grid search object to the training data
# grid_search.fit(X_train, y_train)

# # Print the best hyperparameters and the best score
# print("Best hyperparameters: ", grid_search.best_params_)
# print("Best score: ", grid_search.best_score_)

# # Make predictions on the validation set using the best model
# best_model = grid_search.best_estimator_
# y_pred = best_model.predict(X_valid)

# # Compute the classification report for the best model
# report = classification_report(y_valid, y_pred)
# print(report)

In [29]:
# # rf

# from sklearn.ensemble import RandomForestClassifier

# param_grid = {'n_estimators': [50, 100, 200], 'max_depth': [5, 10, 20, None]}

# model_rf = RandomForestClassifier()

# grid_search = GridSearchCV(model_rf, param_grid, cv=5)

# grid_search.fit(X_train, y_train)

# print("Best hyperparameters: ", grid_search.best_params_)
# print("Best score: ", grid_search.best_score_)

# best_model = grid_search.best_estimator_
# y_pred = best_model.predict(X_valid)

# report = classification_report(y_valid, y_pred)
# print(report)

In [30]:
#===============================================================================================

In [22]:
###### mlp

model_mlp = MLPClassifier(alpha=1, max_iter=500)
model_mlp.fit(X_train, y_train)
y_pred = model_mlp.predict(X_valid)

report = classification_report(y_valid, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.73      0.91      0.81       521
           1       0.69      0.38      0.49       279

    accuracy                           0.73       800
   macro avg       0.71      0.65      0.65       800
weighted avg       0.72      0.72      0.70       800



In [31]:
###### DT

model_tree = DecisionTreeClassifier()
model_tree.fit(X_train, y_train)
y_pred = model_tree.predict(X_valid)

report = classification_report(y_valid, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.77      0.78      0.78       521
           1       0.59      0.57      0.58       279

    accuracy                           0.71       800
   macro avg       0.68      0.68      0.68       800
weighted avg       0.71      0.71      0.71       800



In [32]:
####### adaboost

In [33]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report

tree = DecisionTreeClassifier()

boost = AdaBoostClassifier(base_estimator=tree)

boost.fit(X_train, y_train)

y_pred = boost.predict(X_valid)

report = classification_report(y_valid, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.74      0.87      0.80       521
           1       0.63      0.43      0.51       279

    accuracy                           0.71       800
   macro avg       0.69      0.65      0.65       800
weighted avg       0.70      0.71      0.70       800



In [34]:
# !pip install imblearn


In [35]:
######## SMOTE

from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
clf = DecisionTreeClassifier()
clf.fit(X_train_resampled, y_train_resampled)
y_pred = clf.predict(X_valid)

report = classification_report(y_valid, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.77      0.75      0.76       521
           1       0.55      0.57      0.56       279

    accuracy                           0.69       800
   macro avg       0.66      0.66      0.66       800
weighted avg       0.69      0.69      0.69       800



In [36]:
# test set

In [41]:
test_data = pd.read_csv('WN23_data/WN23_PA_test_tweets.txt', sep=',', encoding='iso-8859-1')
tweet_id = test_data['TweetID']
test_data

Unnamed: 0,TweetID,TimeOfDay,Tweet
0,5001,19,"MI Playmakers - Green 46, FABE 24, 5th Grade -..."
1,5002,22,I biked 2.46 mi with @mapmyride. Check out my ...
2,5003,20,"@Jacob_Nash yes I do, it's just I said hi Best..."
3,5004,20,Countless hours of house hunting has taught me...
4,5005,23,"1 year ago today, Nik Wallenda walked across t..."
...,...,...,...
995,5996,22,When you wonder why people are staring and you...
996,5997,7,Brutal killing of a samba queen exposes ...
997,5998,1,"You're so cavalier, because you're lost before..."
998,5999,2,House hunting is on hold for a little bit ¨ü__


In [42]:
# clean and feature extract

test_tweets = test_data['Tweet']
test_tweets = test_tweets.apply(clean_data)
test_tweets = tfidf_vec.transform(test_tweets)


In [45]:
# get prediction 

prediction = model_svm.predict(test_tweets)

In [46]:
tweet_id

0      5001
1      5002
2      5003
3      5004
4      5005
       ... 
995    5996
996    5997
997    5998
998    5999
999    6000
Name: TweetID, Length: 1000, dtype: int64

In [47]:
result = pd.DataFrame({'TweetID': tweet_id, 'Label': prediction})
result

Unnamed: 0,TweetID,Label
0,5001,1
1,5002,1
2,5003,0
3,5004,0
4,5005,0
...,...,...
995,5996,0
996,5997,0
997,5998,0
998,5999,0


In [48]:
result.to_csv('result_svm_newwwww.csv', index=False)