In [None]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

**Loading the Dataset**

In [None]:
def load_data(file_path):
    """Read a labelled genre file into parallel lists of descriptions and genres.

    Each line is expected to look like
    ``ID ::: TITLE ::: GENRE ::: DESCRIPTION``.

    Parameters
    ----------
    file_path : str
        Path of the UTF-8 text file to read.

    Returns
    -------
    tuple[list[str], list[str]]
        ``(x, y)`` where ``x`` holds the descriptions and ``y`` the matching
        genres, both with the whitespace around the delimiter removed.
        Lines with fewer than four fields (including blank lines) are skipped.
    """
    x = []
    y = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # maxsplit=3: a description that itself contains ':::' stays in one
            # piece instead of making the line look malformed and being dropped.
            parts = [part.strip() for part in line.split(':::', 3)]
            if len(parts) == 4:
                # Stripping matters for the labels: ' drama ' and 'drama' would
                # otherwise be treated as different classes.
                _, _, genre, plot_summary = parts
                x.append(plot_summary)
                y.append(genre)
    return x, y

**Preprocessing the Data**

In [None]:
def preprocess_text(text):
    """Return ``text`` lower-cased with everything except ASCII letters and whitespace removed.

    Removed characters are deleted, not replaced by a space, so ``"L.R."``
    becomes ``"lr"``.
    """
    lowered = text.lower()
    letters_only = re.sub(r'[^a-zA-Z\s]', '', lowered)
    return letters_only

**Loading the Training Data**

In [None]:
# Read the labelled training file into two parallel lists:
# X_train holds the descriptions, y_train the matching genres.
training_data_path ='/content/train_data.txt'
X_train, y_train = load_data(training_data_path)
# The preprocess_text pass below is disabled, so the descriptions reach the
# vectorizer exactly as load_data returned them.
# X_train = [preprocess_text(description) for description in X_train]

**Feature Extraction**

In [None]:
# TF-IDF features for the training descriptions; max_features caps the
# vocabulary at 5000 terms.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
# fit_transform both learns the vocabulary/IDF weights from X_train and
# returns the vectorised training texts.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

**Training the Model using Logistic Regression**

In [None]:
# max_iter is raised from 200: with 200 iterations the solver stopped at the
# iteration limit ("TOTAL NO. of ITERATIONS REACHED LIMIT") before converging,
# so the fitted coefficients were not the optimum. 1000 is only an upper bound;
# the solver still stops as soon as it converges.
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train_tfidf, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


**Loading the Testing Data**

In [None]:
def load_test_data(file_path):
    """Read the descriptions from an unlabelled test file.

    Each line is expected to look like ``ID ::: TITLE ::: DESCRIPTION``.
    Only the description is kept; the id and title are discarded.

    Parameters
    ----------
    file_path : str
        Path of the UTF-8 text file to read.

    Returns
    -------
    list[str]
        One stripped description per well-formed line, in file order.
        Lines with fewer than three fields (including blank lines) are skipped.

    Notes
    -----
    Everything after the second ``:::`` is treated as the description, so a
    description that contains ``:::`` is kept intact rather than dropped.
    Do not pass a labelled four-field file here: its genre would end up at the
    front of the description.
    """
    X = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # maxsplit=2 keeps one output row per input row even when the
            # description contains the delimiter.
            parts = line.split(':::', 2)
            if len(parts) == 3:  # Ensure all parts are present
                X.append(parts[2].strip())
    return X

In [None]:
# Read the test descriptions (no labels) that the trained classifier will score.
test_file_path = '/content/test_data.txt'
X_test = load_test_data(test_file_path)
# As with the training descriptions, the preprocess_text pass is disabled.
# X_test = [preprocess_text(description) for description in X_test]

In [None]:
# transform only (no refit): the test texts are projected onto the vocabulary
# and IDF weights learned from the training data.
X_test_tfidf = tfidf_vectorizer.transform(X_test)

**Predicting Genres for the Test Data**

In [None]:
# Predict a genre for every test description; y_pred keeps all predictions.
y_pred = classifier.predict(X_test_tfidf)

# Printing Predictions
# Only a short, truncated preview is printed: echoing every full description
# overran the notebook's IOPub data-rate limit and the output was cut off.
# X_test holds descriptions (titles are discarded when loading), so the line
# is labelled accordingly.
preview_count = 10   # number of predictions to display
preview_chars = 100  # leading characters of each description to display
for description, prediction in zip(X_test[:preview_count], y_pred):
    print("Prediction for description '{}...': {}".format(description[:preview_chars], prediction))

Prediction for title 'L.R. Brane loves his life - his car, his apartment, his job, but especially his girlfriend, Vespa. One day while showering, Vespa runs out of shampoo. L.R. runs across the street to a convenience store to buy some more, a quick trip of no more than a few minutes. When he returns, Vespa is gone and every trace of her existence has been wiped out. L.R.'s life becomes a tortured existence as one strange event after another occurs to confirm in his mind that a conspiracy is working against his finding Vespa.':  drama 
Prediction for title 'Spain, March 1964: Quico is a very naughty child of three belonging to a wealthy middle-class family. Since Cristina's birth, he feels he has lost the privileged position of "prince" of the house for his eight months old sister. So, with his brother Juan, who is eight years old and is quite disobedient, spend their time committing prank after prank, causing the resulting anger of his mother, the nanny and the old housemaid. The rest

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)




Prediction for title 'When Alan's sister Emily is diagnosed with a terminal illness. The one thing that can save her is a transplant. Strapped for cash, he does everything possible to raise the money needed to get the life saving operation for his sister; while at the same time trying to cope with his own failures in life. He takes a big gamble on a card game in hopes of not only saving his sister but himself.':  drama 
Prediction for title 'Eric (David Guthrie) returns to his hometown to oversee the closing of his families record store currently run by his half sister, Kristin (Amanda Pereira). However, between pressure from his girlfriend and the sudden presence of a secretive girl named Valerie, things start to change as Eric realizes that sometimes you need a little push to see something for what it was: not what it became.':  short 
Prediction for title 'Facing death, terminally ill Parvis Karimpour wants to reconcile with his daughter Nasrin. He and his fellow African travellers

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re  # Import re module for regular expressions
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Fields are separated by ':::' with padding spaces around it. Matching the
# padding as part of the separator keeps it out of the values, so a genre is
# read as 'drama' rather than ' drama '.
FIELD_SEP = r'\s*:::\s*'

# Load training data
# Lines are "ID ::: TITLE ::: GENRE ::: DESCRIPTION". All four fields are named
# and the ID is made the index explicitly, instead of passing three names and
# relying on pandas to turn the unnamed first field into an implicit index.
train_path = "/content/train_data.txt"
train_data = pd.read_csv(
    train_path,
    sep=FIELD_SEP,
    names=['Id', 'Title', 'Genre', 'Description'],
    index_col='Id',
    engine='python',  # regex separators need the python parser
)

# Load test data
# Lines are "ID ::: TITLE ::: DESCRIPTION"; 'Id' stays a regular column here.
test_path = "/content/test_data.txt"
test_data = pd.read_csv(
    test_path,
    sep=FIELD_SEP,
    names=['Id', 'Title', 'Description'],
    engine='python',
)

# Load test solution
# NOTE(review): assumes the solution file uses the same four-field layout as
# the training file (ID ::: TITLE ::: GENRE ::: DESCRIPTION) - confirm.
# The result is a one-column frame, indexed by ID, holding the true genre.
test_soln_path = "/content/test_data_solution.txt"
test_soln_data = pd.read_csv(
    test_soln_path,
    sep=FIELD_SEP,
    names=['Id', 'Title', 'Actual Genre', 'Description'],
    index_col='Id',
    engine='python',
)[['Actual Genre']]

def preprocess_text(text):
    """Lower-case ``text`` and strip out every character that is not an ASCII letter or whitespace.

    Same behaviour as the ``preprocess_text`` defined earlier in the notebook.
    """
    return re.sub(r'[^a-zA-Z\s]', '', text.lower())

# Preprocess text (lower-case, letters and whitespace only) in both frames
train_data['Title'] = train_data['Title'].apply(preprocess_text)
train_data['Description'] = train_data['Description'].apply(preprocess_text)
test_data['Title'] = test_data['Title'].apply(preprocess_text)
test_data['Description'] = test_data['Description'].apply(preprocess_text)

# Concatenate Title and Description for vectorization
train_text = train_data['Title'] + ' ' + train_data['Description']
test_text = test_data['Title'] + ' ' + test_data['Description']

# Split the raw text into training and validation sets FIRST.
# X_train / X_val stay raw text here; they are vectorised where the model is fit.
y = train_data['Genre']
X_train, X_val, y_train, y_val = train_test_split(train_text, y, test_size=0.2, random_state=42)

# Initialize the TF-IDF vectorizer and fit it on the training split only, so
# that vocabulary and IDF weights are not influenced by the validation rows
# (fitting on all of train_text before splitting leaks validation data).
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(X_train)

# Transform the test data with the vocabulary learned from the training split
X_test = tfidf_vectorizer.transform(test_text)


In [None]:
# Initialize and train a Logistic Regression Classifier
# - multi_class is no longer passed: the argument is deprecated in recent
#   scikit-learn, and the 'sag' solver already fits a multinomial model for
#   multi-class targets by default.
# - random_state pins the data shuffling done by the stochastic 'sag' solver,
#   so the fitted model and the scores below are reproducible across runs.
clf_logreg = LogisticRegression(solver='sag', random_state=42)
X_train_tfidf = tfidf_vectorizer.transform(X_train)
clf_logreg.fit(X_train_tfidf, y_train)

# Make predictions on the validation set
X_val_tfidf = tfidf_vectorizer.transform(X_val)
y_pred = clf_logreg.predict(X_val_tfidf)

# Evaluate the performance of the model: overall accuracy plus
# per-genre precision / recall / F1
accuracy = accuracy_score(y_val, y_pred)
print("Validation Accuracy:", accuracy)
print(classification_report(y_val, y_pred))
