In [1]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem; the dataset and results paths
# used by the cells below all live under this mount point.
mount_point = '/content/drive'
drive.mount(mount_point)

Mounted at /content/drive


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [3]:
# Locations on Drive of the labelled training file and the unlabelled test file
train_dir = '/content/drive/MyDrive/LN/Project/train.txt'
test_dir = '/content/drive/MyDrive/LN/Project/test_no_labels.txt'

# Column layout: the test file carries the same columns as the training file
# except for the 'genre' label.
train_columns = ['title', 'from', 'genre', 'director', 'plot']
test_columns = [column for column in train_columns if column != 'genre']

# Both files are read as tab-separated text; column names are supplied here
# rather than taken from a header row.
train_data = pd.read_csv(train_dir, sep='\t', header=None, names=train_columns)
test_data = pd.read_csv(test_dir, sep='\t', names=test_columns)

# Preview the first rows of the training frame
print(train_data.head())

# Plot text is the model input; genre is the label (available for training data only)
X_train, y_train = train_data['plot'], train_data['genre']
X_test = test_data['plot']

                       title       from    genre       director  \
0               Ela Cheppanu     Telugu  romance         Ramana   
1  A Nightmare on Elm Street   American   horror   Samuel Bayer   
2            American Gothic   American   horror     John Hough   
3                       Gang  Bollywood    crime    Mazhar Khan   
4         Intimate Relations    British    drama  Charles Frank   

                                                plot  
0  Sekhar (Tarun) is a graduate from IIM and work...  
1  Kris Fowles (Katie Cassidy) goes to the Spring...  
2  Cynthia is traumatized by the death of her bab...  
3  Four friends, Gangu (Jackie Shroff), Abdul (Na...  
4  Crisis in a middle-class family when the son f...  


In [4]:
# Vectorizer settings: cap the vocabulary at 5000 terms and drop
# scikit-learn's built-in English stop-word list.
tfidf_options = {'max_features': 5000, 'stop_words': 'english'}
tfidf = TfidfVectorizer(**tfidf_options)

# Learn the vocabulary and IDF weights from the training plots only ...
X_train_tfidf = tfidf.fit_transform(X_train)

# ... then project the test plots with that same fitted vectorizer so both
# matrices share one feature space.
X_test_tfidf = tfidf.transform(X_test)

In [5]:
# Linear-kernel support vector classifier (other kernels such as 'rbf' or
# 'poly' can be swapped in here to experiment)
svc = SVC(kernel='linear')

# Fit on the TF-IDF training matrix and the matching genre labels
svc.fit(X=X_train_tfidf, y=y_train)

In [6]:
# Predict one genre per row of the unlabelled test set
test_predictions = svc.predict(X_test_tfidf)

# Sanity check before overwriting the results file: there must be exactly one
# prediction per test row. `svc` and `X_test_tfidf` are plain notebook globals,
# so if they were rebound by running cells out of order the counts will
# typically disagree, and it is better to stop than to export wrong labels.
assert len(test_predictions) == len(test_data), (
    f"Expected {len(test_data)} predictions (one per test row), "
    f"got {len(test_predictions)}; re-run the cells above in order."
)

results_dir = '/content/drive/MyDrive/LN/Project/results.txt'

# Write one predicted genre per line, in test-set order. The encoding is
# explicit so the file does not depend on the platform default.
with open(results_dir, 'w', encoding='utf-8') as f:
    f.writelines(f"{genre}\n" for genre in test_predictions)

In [7]:
# Hold-out evaluation: estimate model quality on a labelled validation split
# carved out of the training data.
#
# Every object created here has its own name (X_fit, X_val, val_tfidf, val_svc, ...)
# so that running this cell leaves the submission pipeline's globals
# (X_train, X_test, y_train, tfidf, X_train_tfidf, X_test_tfidf, svc) intact.
# The prediction/export cell can therefore be re-run safely afterwards.

# Select the relevant columns (plot as feature, genre as label)
X = train_data['plot']  # features (movie plots)
y = train_data['genre']  # target labels (movie genres)

# 80% fit / 20% validation; the fixed seed makes the split reproducible.
# Note that the split is not stratified by genre.
X_fit, X_val, y_fit, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# A fresh vectorizer fitted on the fit portion only, so the vocabulary and IDF
# weights never see validation text.
val_tfidf = TfidfVectorizer(max_features=5000, stop_words='english')  # Adjust max_features as needed
X_fit_tfidf = val_tfidf.fit_transform(X_fit)
X_val_tfidf = val_tfidf.transform(X_val)

# Same classifier configuration as the submission model (linear kernel)
val_svc = SVC(kernel='linear')
val_svc.fit(X_fit_tfidf, y_fit)

# Predict the genres for the validation split
y_pred = val_svc.predict(X_val_tfidf)

# Overall accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Per-genre precision / recall / F1
print(classification_report(y_val, y_pred))


Accuracy: 62.46%
              precision    recall  f1-score   support

      action       0.57      0.62      0.59       202
   animation       0.80      0.61      0.69       115
      comedy       0.45      0.52      0.48       223
       crime       0.60      0.37      0.46       108
       drama       0.49      0.64      0.56       319
      horror       0.82      0.80      0.81       223
     romance       0.59      0.46      0.52       186
      sci-fi       0.68      0.44      0.53        39
     western       0.93      0.89      0.91       194

    accuracy                           0.62      1609
   macro avg       0.66      0.59      0.62      1609
weighted avg       0.64      0.62      0.63      1609

