In [35]:
!pip install scikit-learn==1.7.1


Collecting scikit-learn==1.7.1
  Downloading scikit_learn-1.7.1-cp312-cp312-win_amd64.whl.metadata (11 kB)
Downloading scikit_learn-1.7.1-cp312-cp312-win_amd64.whl (8.7 MB)
   ---------------------------------------- 0.0/8.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/8.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/8.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/8.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/8.7 MB ? eta -:--:--
   - -------------------------------------- 0.3/8.7 MB ? eta -:--:--
   - -------------------------------------- 0.3/8.7 MB ? eta -:--:--
   - -------------------------------------- 0.3/8.7 MB ? eta -:--:--
   -- ------------------------------------- 0.5/8.7 MB 508.0 kB/s eta 0:00:17
   -- ------------------------------------- 0.5/8.7 MB 508.0 kB/s eta 0:00:17
   -- ------------------------------------- 0.5/8.7 MB 508.0 kB/s eta 0:00:17
   -- -----------------------------------

  You can safely remove it manually.


In [36]:
import pandas as pd

# Load CSV
df = pd.read_csv('reply_classification_dataset.csv')

# Basic info
print(df.head())
print(df.info())
print(df['label'].value_counts())


                                               reply     label
0                           Can we discuss pricing??   NEUTRAL
1  Im excited to explore this further, plz send c...  POSITIVE
2                We not looking for new solutions.    negative
3                 Could u clarify features included?   neutral
4           lets,, schedule a meeting to dive deeper  positive
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2129 entries, 0 to 2128
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   reply   2129 non-null   object
 1   label   2129 non-null   object
dtypes: object(2)
memory usage: 33.4+ KB
None
label
neutral     704
positive    446
NEGATIVE    267
POSITIVE    263
Negative    254
negative    189
Neutral       3
NEUTRAL       2
Positive      1
Name: count, dtype: int64


In [37]:
df.isnull().sum()

reply    0
label    0
dtype: int64

## Preprocessing

In [38]:
import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    if pd.isna(text):
        return ""
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)  # remove urls
    text = re.sub(r"[^a-z\s]", "", text)  # remove punctuation and numbers
    text = " ".join([word for word in text.split() if word not in stop_words])
    return text

df['clean_text'] = df['reply'].apply(clean_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\himan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Split Dataset

In [39]:
from sklearn.model_selection import train_test_split

X = df['clean_text']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## Vectorizing 

In [40]:
#convert text to numerical data using TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [41]:
import joblib

joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

['tfidf_vectorizer.pkl']

## Using SVM for the first Model

In [42]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, classification_report

svm_model = SVC(kernel='linear', probability=True, random_state=42)

svm_model.fit(X_train_vec, y_train)

In [43]:
y_pred_svm = svm_model.predict(X_test_vec)

In [44]:
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print("SVM F1 Score:", f1_score(y_test, y_pred_svm, average='weighted'))

SVM Accuracy: 0.8380281690140845
SVM F1 Score: 0.835616068231551


In [45]:
import joblib

joblib.dump(svm_model, "svm_model.pkl")

['svm_model.pkl']

## #########-----------------------MODEL - 2 ----------------------------#####

## Used RandomForest

In [46]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train_vec, y_train)




In [47]:
y_pred_rf = rf_model.predict(X_test_vec)

In [48]:
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Random Forest F1 Score:", f1_score(y_test, y_pred_rf, average='weighted'))

Random Forest Accuracy: 0.8262910798122066
Random Forest F1 Score: 0.8250457628896017


In [49]:
import joblib

joblib.dump(rf_model, "randomForest_model.pkl")

['randomForest_model.pkl']