<a href="https://colab.research.google.com/github/jgoteti/llm/blob/main/My_Own_Classification_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

def transform_value(value):
  """Encode a sender ID as a binary feature: 0 if it is all digits, else 1.

  Args:
    value: The sender identifier. Usually a string, but non-string values
      (ints, floats, None/NaN) are accepted too, so a numeric or missing
      cell does not raise AttributeError.

  Returns:
    0 when the value consists solely of digit characters, 1 otherwise.
    Empty and missing values yield 1.
  """
  # A float holding a whole number (e.g. 98765.0) is judged by its integer
  # form, so the trailing ".0" does not make a numeric ID look alphanumeric.
  if isinstance(value, float) and value.is_integer():
    value = int(value)
  # str and bytes already provide isdigit(); anything else is judged by its
  # textual form ("nan", "None", "-5" are all non-digit and map to 1).
  if not isinstance(value, (str, bytes)):
    value = str(value)
  return 0 if value.isdigit() else 1

# Load the labelled training set and the test set from CSV files in the
# current working directory.
train_data = pd.read_csv('train_data.csv')
test_data = pd.read_csv('test_data.csv')

# Show the class balance of each file. The former mid-cell `test_data.head()`
# was dropped: a bare expression that is not the last statement of a cell is
# evaluated and thrown away, so it never displayed anything.
print(train_data['main_label'].value_counts())
print(test_data['main_label'].value_counts())



main_label
Not_Promotional    2272
Promotional         876
Name: count, dtype: int64
main_label
Not_Promotional    1143
Promotional         980
Name: count, dtype: int64


In [None]:
# Number of labelled training rows before the train/validation split.
# Read from train_data because train_labels is only created by the split in a
# later cell, so referencing it here fails on a fresh kernel.
train_data['main_label'].shape[0]

3148

In [None]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# partition identical from run to run.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_data[['sender_id', 'text']],
    train_data['main_label'],
    test_size=0.2,
    random_state=42,
)

# Report the shape of each piece, one per line.
print(
    *(piece.shape for piece in (train_texts, val_texts, train_labels, val_labels)),
    sep='\n',
)

(2518, 2)
(630, 2)
(2518,)
(630,)


In [None]:
# Fit the TF-IDF vocabulary and weights on the training split only, then
# project the validation and test text into that same feature space.
# TODO: Use a better vectorizer
vectorizer = TfidfVectorizer()
train_texts_tfidf = vectorizer.fit_transform(train_texts['text'])
val_texts_tfidf, test_data_tfidf = (
    vectorizer.transform(frame['text']) for frame in (val_texts, test_data)
)

In [None]:
# Map every sender_id to the 0/1 flag produced by transform_value, applying
# the same encoding to the training, validation and test frames.
# TODO: make a pipeline instead of repeating the steps
train_sender, val_sender, test_sender = (
    frame['sender_id'].apply(transform_value)
    for frame in (train_texts, val_texts, test_data)
)


In [None]:
# Sanity check before stacking: there must be one sender flag per TF-IDF row.
print(train_sender.shape)
print(train_texts_tfidf.shape)

# csr_matrix turns the 1-D sender Series into a single sparse row; reshaping
# to (-1, 1) makes it a single column so it can be stacked beside the TF-IDF
# columns.
train_sender_sparse = csr_matrix(train_sender).reshape(-1, 1)
val_sender_sparse = csr_matrix(val_sender).reshape(-1, 1)
test_sender_sparse = csr_matrix(test_sender).reshape(-1, 1)

# Combine features: the sender flag becomes column 0, followed by the TF-IDF
# columns. CSR output is requested explicitly so the matrices are row-indexable
# and do not have to be converted from hstack's default COO format downstream.
combined_train_features = hstack([train_sender_sparse, train_texts_tfidf], format='csr')
combined_val_features = hstack([val_sender_sparse, val_texts_tfidf], format='csr')
combined_test_features = hstack([test_sender_sparse, test_data_tfidf], format='csr')
print(combined_train_features.shape)
print(train_labels.shape)

(2518,)
(2518, 6812)
(2518, 6813)
(2518,)


In [None]:
# Create a neural network classifier with two hidden layers (100 and 50 units)
# and logistic activations, trained with Adam.
# random_state is fixed (same seed as the train/validation split) so weight
# initialisation and batch shuffling are repeatable; with random_state=None
# the accuracies below changed on every run.
classifier = MLPClassifier(hidden_layer_sizes=(100, 50), activation='logistic', solver='adam',
                          alpha=0.0001, batch_size='auto', learning_rate='constant',
                          learning_rate_init=0.001, power_t=0.5, max_iter=200,
                          shuffle=True, random_state=42, tol=0.0001,
                          verbose=False, warm_start=False, momentum=0.9,
                          nesterovs_momentum=True, early_stopping=False,
                          validation_fraction=0.1, beta_1=0.9, beta_2=0.999,
                          epsilon=1e-08, n_iter_no_change=10)

# Train the classifier on the combined sender + TF-IDF training features
classifier.fit(combined_train_features, train_labels)

# Make predictions on validation set
val_predictions = classifier.predict(combined_val_features)

# Evaluate performance on validation set
val_accuracy = accuracy_score(val_labels, val_predictions)
print("Validation Accuracy:", val_accuracy)

# Make predictions on test set
test_predictions = classifier.predict(combined_test_features)

# Predicted label for each test row (NumPy abbreviates long arrays when printed)
print(test_predictions)

# Evaluate performance on test set
test_accuracy = accuracy_score(test_data['main_label'], test_predictions)
print("Test Accuracy:", test_accuracy)


Validation Accuracy: 0.9682539682539683
['Promotional' 'Not_Promotional' 'Promotional' ... 'Not_Promotional'
 'Not_Promotional' 'Not_Promotional']
Test Accuracy: 0.8365520489872822
