#### import data

In [1]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split 
import re 
import string
import nltk 
import string 
from nltk. corpus import stopwords 
from nltk.stem import SnowballStemmer, WordNetLemmatizer 
from sklearn.feature_extraction.text import CountVectorizer 
import xgboost as xgb 
from sklearn.metrics import accuracy_score, classification_report 
from sklearn.metrics import precision_score, recall_score, roc_auc_score, f1_score 
from sklearn.tree import DecisionTreeClassifier
import mlflow
import mlflow.sklearn

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/campusx-official/jupyter-masterclass/main/tweet_emotions.csv')
df.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [3]:
df = df.sample(100)

In [4]:
df.drop(columns=['tweet_id'], inplace=True)

#### data pre processing

In [5]:
final_df = df[df['sentiment'].isin(['happiness','sadness'])]

In [6]:
final_df['sentiment'] = final_df['sentiment'].replace(
    {
        'happiness': 1,
        'sadness': 0
    }
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['sentiment'] = final_df['sentiment'].replace(


In [7]:
train_data, test_data = train_test_split(final_df, test_size=0.2, random_state=42)

In [8]:
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\iampr\AppData\Roaming\nltk_data...


[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\iampr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
def lemmatization(text):
    lemmatizer = WordNetLemmatizer()
    text = text.split()
    text = [lemmatizer.lemmatize(y) for y in text]
    return " ".join(text)

def remove_stop_words (text):
    stop_words = set(stopwords.words('english'))
    Text = [i for i in str(text).split() if i not in stop_words]
    return " ".join(Text)

def removing_numbers(text):
    text = "".join([i for i in text if not i.isdigit()])
    return text

def lower_case(text):
    text = text.split()
    text = [y.lower() for y in text]
    return " ".join(text)


def removing_punctuations(text):
    # Remove punctuation using regex and string.punctuation
    text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text)
    
    #remove extra whitespace
    text = re.sub('\s+',' ', text)
    text = " ".join(text.split())
    return text.strip()


def removing_urls(text):
    url_pattern = re.compile(r'https://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

def remove_small_sentences(df):
    for i in range(len(df)):
        if len(df.text.iloc[i].split()) < 3:
            df.text.iloc[1] = np.nan
            
            
def normalize_text(df):
    df.content = df.content.apply(lambda content : lower_case(content))
    df.content = df.content.apply(lambda content : remove_stop_words(content))
    df.content = df.content.apply(lambda content : removing_numbers(content))
    df.content = df.content.apply(lambda content : removing_punctuations(content))
    df.content = df.content.apply(lambda content : removing_urls(content))
    df.content = df.content.apply(lambda content : lemmatization(content))    
    return df


def normalize_sentence(sentence):
    sentence = lower_case(sentence)
    sentence = remove_stop_words(sentence)
    sentence = removing_numbers(sentence)
    sentence = removing_punctuations(sentence)
    sentence = removing_urls(sentence)
    sentence = lemmatization(sentence)
    return sentence

In [10]:
train_data = normalize_text(train_data)
test_data = normalize_text(test_data)

In [11]:
#extract x-train, x-test, y-train, y-test
X_train = train_data['content'].values
y_train = train_data['sentiment'].values

X_test = test_data['content'].values
y_test = test_data['sentiment'].values

#### text vectorization - BOW

In [12]:
#bow vectorizer
vectorizer = CountVectorizer(max_features=10)
x_train_bow = vectorizer.fit_transform(X_train)
x_test_bow = vectorizer.transform(X_test)

#### MLFlow experiment tracking

In [13]:
import dagshub
dagshub.init(repo_owner='iamprashantjain', repo_name='mlops-mini-project', mlflow=True)
mlflow.set_tracking_uri("https://dagshub.com/iamprashantjain/mlops-mini-project.mlflow")
mlflow.set_experiment('Decision Tree Baseline')

<Experiment: artifact_location='mlflow-artifacts:/7f52cd073c034a88bca79863caa6a1e7', creation_time=1745838305051, experiment_id='1', last_update_time=1745838305051, lifecycle_stage='active', name='Decision Tree Baseline', tags={}>

#### applying decision tree

In [14]:
with mlflow.start_run():
    mlflow.log_param("vectorizer", "BOW")
    mlflow.log_param("num_features", 10)
    mlflow.log_param("test_size", 0.2)
            
    # Train model
    dt = DecisionTreeClassifier()
    dt.fit(x_train_bow, y_train)
    mlflow.log_param("model","Decision Tree")
    
    #evaluation   
    y_pred = dt.predict(x_test_bow)
    accuracy = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    # Log metrics and params
    mlflow.log_metric('accuracy', accuracy)
    mlflow.log_metric('precision', prec)
    mlflow.log_metric('recall', rec)
    mlflow.log_metric('f1_score', f1)
 

    #log model
    mlflow.sklearn.log_model(dt, "Decision_Tree_Model")
    
    #log jupyter notebook
    import os
    notebook_path = "exp1.ipynb"
    os.system(f"jupyter nbconvert --to notebook --execute --inplace {notebook_path}")
    mlflow.log_artifact(notebook_path)
    
    
    print("accuracy: ", accuracy)
    print("precision: ", prec)
    print("recall: ", rec)
    print("f1_score: ", f1)



accuracy:  0.6
precision:  0.6
recall:  1.0
f1_score:  0.7499999999999999


2025/04/28 18:36:15 INFO mlflow.tracking._tracking_service.client: 🏃 View run amazing-sloth-261 at: https://dagshub.com/iamprashantjain/mlops-mini-project.mlflow/#/experiments/1/runs/af533942830d41488f324330f79d9256.


2025/04/28 18:36:15 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/iamprashantjain/mlops-mini-project.mlflow/#/experiments/1.
