# Tweet classifier

### Import libs

In [13]:
import re
import string

import joblib
import mlflow
import pandas as pd
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

### Define functions

In [2]:
def remove_characters(text: str) -> str:
    """Strip digits and ASCII punctuation from ``text``.

    Runs of digits are deleted first, then every character listed in
    ``string.punctuation`` is removed. Whitespace, letters and any
    non-ASCII symbols are left untouched.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without digits or ASCII punctuation.
    """
    without_digits = re.sub(r'\d+', '', text)
    # A translation table with only a "delete" set removes those characters.
    punctuation_table = str.maketrans('', '', string.punctuation)

    return without_digits.translate(punctuation_table)

In [3]:
def clean_text(text) -> str:
    """Clean a value and drop its final space-separated token.

    The value is coerced to ``str``, passed through ``remove_characters``,
    and then cut at its last space: everything after that space is
    discarded. A string that contains no space is returned whole.

    Args:
        text: Any value; it is converted with ``str()`` before cleaning.

    Returns:
        The cleaned string without its last space-separated token.
    """
    # NOTE(review): dropping the last token presumably targets the trailing
    # link / truncated word at the end of a tweet - confirm.
    stripped = remove_characters(str(text))
    # rsplit always yields at least one element, so the unpacking is safe.
    head, *_ = stripped.rsplit(' ', 1)
    return head

In [4]:
def vectorize_text(tweets: list[str]):
    """Fit a TF-IDF vectorizer on ``tweets`` and return a dense feature matrix.

    A new ``TfidfVectorizer`` is created on every call (at most 2500
    features, ``min_df=1``, ``max_df=0.8``), fitted on ``tweets`` and used to
    transform them. The fitted vectorizer is local to this function and is
    not returned.

    Args:
        tweets: The documents to vectorize.

    Returns:
        The TF-IDF matrix converted to a dense array via ``.toarray()``.
    """
    tfidf = TfidfVectorizer(max_features=2500, min_df=1, max_df=0.8)
    sparse_matrix = tfidf.fit_transform(tweets)
    return sparse_matrix.toarray()

In [5]:
def label_encoder(parties):
    """Encode the values in ``parties`` with a newly fitted ``LabelEncoder``.

    The encoder is created and fitted inside this function and is not
    returned.

    Args:
        parties: The label values to encode.

    Returns:
        The result of ``LabelEncoder().fit_transform(parties)``.
    """
    party_encoder = LabelEncoder()
    encoded = party_encoder.fit_transform(parties)
    return encoded

### Load data

In [6]:
# Read the raw tweets; the path is relative to the kernel's working directory.
df = pd.read_csv("data/Tweets.csv")
# Seeded so the preview shows the same rows on every run.
df.sample(5, random_state=42)

Unnamed: 0,Party,Tweet
21777,Republican,RT @SecretarySonny: While walking the halls at...
67874,Democrat,RT @RepPeterDeFazio: Wall Street is gearing up...
46542,Democrat,Looking for inspiration? Look no further than ...
19343,Democrat,Congrats to @MikaelaShiffrin! Check out her ru...
23766,Democrat,Only 3 days left to sign up for health insuran...


### Define training features X and target y

In [7]:
# Clean into a new Series instead of overwriting df.Tweet: clean_text drops
# the trailing token on every call, so an in-place version of this cell
# would shorten the tweets a little more each time it is re-run.
tweets_clean = df.Tweet.apply(clean_text)
X = vectorize_text(tweets_clean.values)
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [8]:
# Normalise the labels into a new Series so the raw df.Party column stays
# intact and re-running this cell always starts from the same input.
party_clean = df.Party.apply(clean_text)
y = label_encoder(party_clean.values)
y

array([1, 0, 0, ..., 1, 1, 1])

### Train test split

In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed pins the shuffle.
# X was vectorized on the full corpus before this split, so the TF-IDF
# statistics were computed with the held-out tweets included.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Set params and train model

In [10]:
# Hyper-parameters for the scikit-learn wrapper. The wrapper's own keyword
# names are used (learning_rate / reg_lambda / reg_alpha) rather than the
# native-booster aliases (eta / lambda / alpha) so that every setting is an
# explicit, documented constructor argument.
# NOTE(review): the saved output of this cell reports objective
# 'multi:softprob', which does not match the value set here - the output
# looks stale; re-run to confirm.
params = {
    "objective": "binary:logistic",   # binary classification
    "eval_metric": "logloss",         # log loss on predicted probabilities

    # Tree parameters
    "max_depth": 6,                   # controls tree complexity
    "min_child_weight": 1,            # minimum sum of instance weight (hessian) needed in a child
    "gamma": 0,                       # min loss reduction to make a split
    "subsample": 0.8,                 # fraction of training samples used per tree
    "colsample_bytree": 0.8,          # fraction of features used per tree

    # Regularization
    "reg_lambda": 1,                  # L2 regularization term
    "reg_alpha": 0,                   # L1 regularization term

    # Learning rate
    "learning_rate": 0.1,             # step size shrinkage
    "n_estimators": 300,              # number of boosting rounds
}

model = xgb.XGBClassifier(**params)
model.fit(X_train, y_train)


0,1,2
,objective,'multi:softprob'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


### Evaluate results

In [11]:
# Per-class precision / recall / F1 of the freshly trained model on the
# held-out split. classification_report comes from the notebook's import cell.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.61      0.67      8465
           1       0.68      0.80      0.73      8828

    accuracy                           0.71     17293
   macro avg       0.71      0.70      0.70     17293
weighted avg       0.71      0.71      0.70     17293



In [18]:
# Keep the text report in a variable; no cell shown in this notebook reads it.
cr = classification_report(y_true=y_test, y_pred=y_pred)

In [21]:
# Location of a previously logged MLflow model, given relative to the
# repository instead of an absolute file:// URI so the notebook is not tied
# to one machine's home directory.
# NOTE(review): assumes the kernel's working directory is the folder that
# contains mlruns/ (the relative "data/Tweets.csv" load suggests the same
# root) - confirm.
logged_model = "mlruns/595336408377215950/models/m-359665d13f1345aa8d0181435c0fc9f6/artifacts"

loaded_model = mlflow.xgboost.load_model(logged_model)

In [23]:
# Score the reloaded model on the same held-out split; this rebinds y_pred.
y_pred = loaded_model.predict(X_test)
loaded_report = classification_report(y_test, y_pred)
print(loaded_report)

              precision    recall  f1-score   support

           0       0.74      0.61      0.67      8465
           1       0.68      0.80      0.73      8828

    accuracy                           0.71     17293
   macro avg       0.71      0.70      0.70     17293
weighted avg       0.71      0.71      0.70     17293

