Objective: Intensity Analysis — train a text classifier that labels each message as angriness, happiness, or sadness.

Importing libraries

In [1]:
import os
from pathlib import Path

import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

Load data

In [2]:
# Directory holding the three per-emotion CSV files. Set the INTENSITY_DATA_DIR
# environment variable to point somewhere else; the fallback is the original
# local path so existing runs keep working unchanged.
DATA_DIR = Path(os.environ.get(
    'INTENSITY_DATA_DIR',
    '/Users/kuriankgeorge/Desktop/aws/Capstone/10_Intensity/Intensity_data',
))

# One CSV per emotion class.
angriness_df = pd.read_csv(DATA_DIR / 'angriness.csv')
happiness_df = pd.read_csv(DATA_DIR / 'happiness.csv')
sadness_df = pd.read_csv(DATA_DIR / 'sadness.csv')

Concatenate all data into one DataFrame

In [3]:
# Stack the three per-emotion frames into one table with a fresh 0..n-1 index.
data = pd.concat(
    objs=[angriness_df, happiness_df, sadness_df],
    ignore_index=True,
)

In [4]:
# Preview the first five rows of the combined frame (text plus its emotion label).
data.head(n=5)

Unnamed: 0,content,intensity
0,"Sometimes I’m not angry, I’m hurt and there’s ...",angriness
1,Not available for busy people☺,angriness
2,I do not exist to impress the world. I exist t...,angriness
3,Everything is getting expensive except some pe...,angriness
4,My phone screen is brighter than my future 🙁,angriness


In [5]:
# (rows, columns) of the combined frame.
data.shape

(2039, 2)

Encode the intensity labels

In [6]:
# Learn the mapping from the label values in `intensity` to integer codes, then
# overwrite the column with those codes. `le.classes_` keeps the original names
# for later reporting.
le = LabelEncoder().fit(data['intensity'])
data['intensity'] = le.transform(data['intensity'])

In [7]:
# Preview the first five rows again; `intensity` now holds the integer codes.
data.head(n=5)

Unnamed: 0,content,intensity
0,"Sometimes I’m not angry, I’m hurt and there’s ...",0
1,Not available for busy people☺,0
2,I do not exist to impress the world. I exist t...,0
3,Everything is getting expensive except some pe...,0
4,My phone screen is brighter than my future 🙁,0


Split into training and test sets

In [8]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['content'],
    data['intensity'],
    test_size=0.2,
    random_state=42,
)

Build a text classification pipeline

# Two-stage text classifier:
#   'tfidf' - converts raw text to TF-IDF features
#   'clf'   - logistic regression for classification, iteration cap set to 1000
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(max_iter=1000)),
])

Train the model

In [10]:
# Fit both pipeline steps (TF-IDF vectorizer, then classifier) on the training split.
pipeline.fit(X_train, y_train)

Predict on the test set

In [11]:
# Predicted integer class codes for the held-out test split.
y_pred = pipeline.predict(X_test)

Evaluate the model

In [12]:
# Overall accuracy on the test split.
print(
    "Accuracy:",
    accuracy_score(y_test, y_pred),
)
# Per-class precision/recall/F1; rows are labelled with the original class
# names stored in the fitted encoder.
print(
    "\nClassification Report:\n",
    classification_report(y_test, y_pred, target_names=le.classes_),
)

Accuracy: 0.7769607843137255

Classification Report:
               precision    recall  f1-score   support

   angriness       0.82      0.80      0.81       152
   happiness       0.70      0.83      0.76       137
     sadness       0.84      0.69      0.76       119

    accuracy                           0.78       408
   macro avg       0.79      0.77      0.78       408
weighted avg       0.79      0.78      0.78       408



Cross-validation

In [14]:
# 5-fold cross-validation of the pipeline over the full dataset.
cv_scores = cross_val_score(
    pipeline,
    X=data['content'],
    y=data['intensity'],
    cv=5,
)
print("\nCross-Validation Accuracy Scores:", cv_scores)
print("Mean Cross-Validation Accuracy:", cv_scores.mean())


Cross-Validation Accuracy Scores: [0.55147059 0.76715686 0.74509804 0.70098039 0.79361179]
Mean Cross-Validation Accuracy: 0.7116635351929469


Saving the model

In [15]:
# The pipeline was trained on the integer codes produced by `le`, so its
# predictions are integers. Save the fitted encoder alongside the model so a
# consumer can map predictions back to class names (le.inverse_transform).
joblib.dump(le, 'intensity_label_encoder.joblib')

# Save the pipeline to a file (dumped last so the cell's displayed output is unchanged).
joblib.dump(pipeline, 'intensity_analysis_model.joblib')

['intensity_analysis_model.joblib']