In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import csv


In [2]:
# Read the training data: one record per line, fields split on the literal ':::'.
# A multi-character separator is handled by pandas' pure-Python parser, hence engine='python'.
# NOTE(review): whitespace around ':::' is kept inside the parsed fields — confirm that is intended.
train_file_path = 'train_data.txt'
train_df = pd.read_csv(
    train_file_path,
    sep=':::',
    engine='python',
    header=None,  # treat the first line as data; column names are supplied below
    names=['ID', 'TITLE', 'GENRE', 'DESCRIPTION'],
)


In [3]:
# Read the test data with the same ':::' layout as the training file,
# except that there is no GENRE column (it is what gets predicted later).
# NOTE(review): whitespace around ':::' is kept inside the parsed fields — confirm that is intended.
test_file_path = 'test_data.txt'
test_df = pd.read_csv(
    test_file_path,
    sep=':::',
    engine='python',
    header=None,  # treat the first line as data; column names are supplied below
    names=['ID', 'TITLE', 'DESCRIPTION'],
)


In [4]:
# Select the model inputs and targets.
# No pre-processing is applied yet (no missing-value handling, no label encoding):
# the raw DESCRIPTION text is the feature and the raw GENRE string is the label.
# Any cleaning or feature engineering belongs here, before the split.
X_train = train_df['DESCRIPTION']
y_train = train_df['GENRE']


In [5]:
# Hold out 20% of the labelled data for validation; the fixed seed keeps the split reproducible.
# NOTE: X_train / y_train are rebound to the 80% portion, so this cell is not idempotent —
# re-running it without first re-running the cell that defines X_train / y_train
# would split the already-reduced training set again.
split_parts = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)
X_train, X_val, y_train, y_val = split_parts


In [6]:
# Turn the description text into TF-IDF features.
# The vectorizer is fitted on the training split only; the validation split is
# transformed with that same fitted vocabulary so no validation text leaks into it.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',  # drop scikit-learn's built-in English stop words
    max_features=5000,     # cap the vocabulary size
)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)


In [None]:
# Train a logistic-regression genre classifier (default hyper-parameters)
# on the TF-IDF features of the training split. fit() returns the estimator itself.
genre_model = LogisticRegression().fit(X_train_tfidf, y_train)


In [7]:
# Predict a genre label for every description in the validation split,
# using the validation TF-IDF matrix produced by the fitted vectorizer.
genre_pred = genre_model.predict(X_val_tfidf)


In [None]:
# Score the validation predictions against the true labels.
# All three metrics take (y_true, y_pred) in that order.
truth_and_pred = (y_val, genre_pred)
accuracy = accuracy_score(*truth_and_pred)
conf_matrix = confusion_matrix(*truth_and_pred)
classification_rep = classification_report(*truth_and_pred)


In [9]:
# Report the validation metrics: overall accuracy first, then the two multi-line summaries.
print(f"Accuracy on Validation Set: {accuracy}")
for heading, payload in (
    ("\nConfusion Matrix:\n", conf_matrix),
    ("\nClassification Report:\n", classification_rep),
):
    print(heading, payload)


Accuracy on Validation Set: 0.5788988287374343

Confusion Matrix:
 [[  67    0    1    0    0   29    1   34   93    0    0    0    0   11
     0    0    0    0    0    0    4    7    4    0   12    0    0]
 [   0   24   13    0    0   32    0    6   27    0    0    0    0    1
     0    0    0    0    0    0    0    7    0    0    1    0    1]
 [   5    0   21    0    0   18    0   28   35    1    0    0    0   10
     0    0    0    0    1    0    4   10    0    0    3    0    3]
 [   1    0    2   10    0   22    0   17   23    8    1    0    0    2
     1    0    0    0    0    0    6   11    0    0    0    0    0]
 [   0    0    0    0    0    3    0   38   16    0    0    0    0    0
     0    0    0    0    0    0    0    4    0    0    0    0    0]
 [   6    1    1    1    0  832    1   87  429    2    0    0    0   12
     2    1    0    0    7    1    1   49    2    1    4    0    3]
 [   7    0    0    0    0   19    3   13   48    0    0    0    0    6
     0    0    0    0

In [10]:
# Vectorize the test descriptions with the already-fitted TF-IDF vectorizer
# (transform only — the vocabulary learned from the training split is reused).
test_descriptions = test_df['DESCRIPTION']
X_test_tfidf = tfidf_vectorizer.transform(test_descriptions)


In [11]:
# Predict a genre label for every test description from its TF-IDF features.
test_predictions = genre_model.predict(X_test_tfidf)


In [12]:
# Attach the predicted labels to the test frame as a new PREDICTED_GENRE column
# (assigning a whole column through .loc creates it when it does not exist yet).
test_df.loc[:, 'PREDICTED_GENRE'] = test_predictions


In [13]:
# Write the test rows plus their predicted genre to disk as comma-separated text,
# without an index column or a header row.
# Quoting is disabled, so a comma inside a field is written escaped as '\,'.
output_file_path = 'test_data_solution.txt'
test_df.to_csv(
    output_file_path,
    sep=',',
    header=False,
    index=False,
    quoting=csv.QUOTE_NONE,
    escapechar='\\',
)


In [14]:
# Rewrite the output file with ':::' as the field separator.
# The rows are rebuilt straight from test_df instead of replacing every ',' in the
# comma-separated file: that file also holds the commas that occur inside titles and
# descriptions (written as '\,' because of the escapechar), so a blanket replace
# turned each of them into a bogus '\:::' delimiter and corrupted the records.
with open(output_file_path, 'w', encoding='utf-8') as file:
    # name=None yields plain tuples in column order: ID, TITLE, DESCRIPTION, PREDICTED_GENRE
    for row in test_df.itertuples(index=False, name=None):
        # Missing values become empty fields, matching DataFrame.to_csv's default
        fields = ('' if pd.isna(value) else str(value) for value in row)
        file.write(':::'.join(fields) + '\n')