# Loading the data

In [21]:
import io
import pandas as pd
import numpy as np
from IPython.display import clear_output
from tqdm.auto import tqdm
from copy import deepcopy


import torch
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import torch.optim as optim
from torch import nn
from sklearn.calibration import LabelEncoder
import torch.utils.data as data_utils
from torch.utils.data import DataLoader, Dataset

# set seed 
seed = 0
torch.manual_seed(0)
np.random.seed(0)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [2]:
# Read cleaned data from csv
df = pd.read_csv('input/movie_data_tmbd_cleaned.csv', sep='|')

In [3]:
# Print info about the data
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17425 entries, 0 to 17424
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   adult                 17425 non-null  int64  
 1   budget                17425 non-null  int64  
 2   genres                17410 non-null  object 
 3   original_language     17425 non-null  object 
 4   overview              17425 non-null  object 
 5   popularity            17425 non-null  float64
 6   production_companies  16811 non-null  object 
 7   production_countries  17253 non-null  object 
 8   revenue               17425 non-null  int64  
 9   runtime               17425 non-null  float64
 10  spoken_languages      17376 non-null  object 
 11  status                17425 non-null  object 
 12  tagline               17425 non-null  object 
 13  video                 17425 non-null  int64  
 14  vote_average          17425 non-null  float64
 15  vote_count         

In [4]:
# Encode categorical variables
label_encoder = LabelEncoder()
df['category'] = label_encoder.fit_transform(df['category'])  # Replace with your target column

In [5]:

# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Encode the overview and tagline columns
overview_encoded = tfidf_vectorizer.fit_transform(df['overview']).toarray()
overview_encoded_df = pd.DataFrame(overview_encoded, columns=tfidf_vectorizer.get_feature_names_out())

tagline_encoded = tfidf_vectorizer.fit_transform(df['tagline']).toarray()
tagline_encoded_df = pd.DataFrame(tagline_encoded, columns=tfidf_vectorizer.get_feature_names_out())


In [6]:
from sklearn.preprocessing import MultiLabelBinarizer

# Function to preprocess the column
def preprocess_column(column):
    # Split the strings into lists
    return column.fillna('').str.split(',')

# Function to encode a list column with MultiLabelBinarizer
def encode_column(column):
    mlb = MultiLabelBinarizer()
    return mlb.fit_transform(preprocess_column(column)), mlb.classes_

# Encoding list-based features
genres_encoded, genres_classes = encode_column(df['genres'])
# production_companies_encoded, companies_classes = encode_column(df['production_companies'])
# production_countries_encoded, countries_classes = encode_column(df['production_countries'])
# cast_encoded, cast_classes = encode_column(df['cast'])
# directors_encoded, directors_classes = encode_column(df['directors'])

# Create DataFrames for the encoded list columns
genres_encoded_df = pd.DataFrame(genres_encoded, columns=genres_classes)
# production_companies_encoded_df = pd.DataFrame(production_companies_encoded, columns=companies_classes)
# production_countries_encoded_df = pd.DataFrame(production_countries_encoded, columns=countries_classes)
# cast_encoded_df = pd.DataFrame(cast_encoded, columns=cast_classes)
# directors_encoded_df = pd.DataFrame(directors_encoded, columns=directors_classes)


In [11]:
print(genres_encoded_df.head())

      Action  Adventure  Animation  Comedy  Crime  Documentary  Drama  Family  \
0  0       0          0          0       0      0            0      1       0   
1  0       0          0          0       0      0            1      0       0   
2  0       0          0          0       0      0            0      0       0   
3  0       0          0          0       0      0            0      1       0   
4  0       0          0          0       0      0            0      0       0   

   Fantasy  History  Horror  Music  Mystery  Romance  Science Fiction  \
0        0        0       0      0        0        0                0   
1        0        0       0      0        0        0                0   
2        0        1       0      0        0        0                0   
3        0        0       0      0        0        0                0   
4        0        0       0      0        0        0                0   

   TV Movie  Thriller  War  Western  
0         0         0    0        1 

In [7]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17425 entries, 0 to 17424
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   adult                 17425 non-null  int64  
 1   budget                17425 non-null  int64  
 2   genres                17410 non-null  object 
 3   original_language     17425 non-null  object 
 4   overview              17425 non-null  object 
 5   popularity            17425 non-null  float64
 6   production_companies  16811 non-null  object 
 7   production_countries  17253 non-null  object 
 8   revenue               17425 non-null  int64  
 9   runtime               17425 non-null  float64
 10  spoken_languages      17376 non-null  object 
 11  status                17425 non-null  object 
 12  tagline               17425 non-null  object 
 13  video                 17425 non-null  int64  
 14  vote_average          17425 non-null  float64
 15  vote_count         

In [12]:
unencoded_df = df[['adult', 'budget', 'popularity', 'runtime', 'vote_count', 'release_year', 'release_month', 'release_day']]
final_df = pd.concat([overview_encoded_df, tagline_encoded_df, 
                      genres_encoded_df, unencoded_df], axis=1)

# NN

In [16]:
X = final_df
Y = df['category']

In [17]:
# Split the dataset into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


In [18]:
# Create a custom Dataset class
class MovieDataset(Dataset):
    def __init__(self, features, labels):
        # Convert features to a NumPy array if it's a DataFrame
        if isinstance(features, pd.DataFrame):
            features = features.to_numpy()  # Convert DataFrame to NumPy array
        
        self.X = torch.tensor(features, dtype=torch.float32)  # Convert features to tensor
        self.y = torch.tensor(labels, dtype=torch.long)       # Convert labels to tensor

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# Assuming X_train and Y_train are your features and labels DataFrames
# Convert Y_train to a NumPy array if it's also a DataFrame
if isinstance(Y_train, pd.Series):
    Y_train = Y_train.to_numpy()

if isinstance(Y_test, pd.Series):
    Y_test = Y_test.to_numpy()

# Create Dataset objects
train_dataset = MovieDataset(X_train, Y_train)
test_dataset = MovieDataset(X_test, Y_test)

# Create DataLoader objects
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [19]:
import torch.nn as nn
import torch.nn.functional as F

class SimpleNN(nn.Module):
    def __init__(self, input_size, num_classes):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)  # First hidden layer
        self.fc2 = nn.Linear(128, 64)           # Second hidden layer
        self.fc3 = nn.Linear(64, num_classes)   # Output layer

    def forward(self, x):
        x = F.relu(self.fc1(x))  # Activation function
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Initialize the model
input_size = X_train.shape[1]  # Number of input features
num_classes = len(np.unique(Y_train))  # Number of classes in the target
model = SimpleNN(input_size, num_classes)


In [20]:
import torch.nn as nn
import torch.nn.functional as F

class SimpleNN(nn.Module):
    def __init__(self, input_size, num_classes):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)  # First hidden layer
        self.fc2 = nn.Linear(128, 64)           # Second hidden layer
        self.fc3 = nn.Linear(64, num_classes)   # Output layer

    def forward(self, x):
        x = F.relu(self.fc1(x))  # Activation function
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Initialize the model
input_size = X_train.shape[1]  # Number of input features
num_classes = len(np.unique(Y_train))  # Number of classes in the target
model = SimpleNN(input_size, num_classes)


In [28]:
# Define the neural network
class MovieClassifier(nn.Module):
    def __init__(self, input_size, num_classes):
        super(MovieClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, 512)  # First hidden layer with 128 neurons
        self.fc2 = nn.Linear(512, 128)           # Second hidden layer with 64 neurons
        self.fc3 = nn.Linear(128, 64)            # Third hidden layer with 32 neurons
        self.fc4 = nn.Linear(64, 16)            # Fourth hidden layer with 16 neurons
        self.output = nn.Linear(16, num_classes) # Output layer

        # Activation function
        self.relu = nn.ReLU()
        # Dropout layer for regularization
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.relu(self.fc3(x))
        x = self.dropout(x)
        x = self.relu(self.fc4(x))
        x = self.output(x)  # Output layer (logits)
        return x
    
# Initialize the model
input_size = X_train.shape[1]  # Number of input features
num_classes = len(np.unique(Y_train))  # Number of classes in the target
model = MovieClassifier(input_size, num_classes)

In [29]:
# Set the device
model.to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10  # Set the number of epochs
for epoch in range(num_epochs):
    model.train()
    for features, labels in train_loader:
        features, labels = features.to(device), labels.to(device)

        # Forward pass
        outputs = model(features)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')


Epoch [1/10], Loss: 2.2932
Epoch [2/10], Loss: 1.2116
Epoch [3/10], Loss: 1.0979
Epoch [4/10], Loss: 1.1804
Epoch [5/10], Loss: 1.3322
Epoch [6/10], Loss: 1.3068
Epoch [7/10], Loss: 1.2985
Epoch [8/10], Loss: 1.2965
Epoch [9/10], Loss: 1.2701
Epoch [10/10], Loss: 1.3098


In [30]:
# Evaluate the model
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for features, labels in test_loader:
        features, labels = features.to(device), labels.to(device)
        outputs = model(features)
        _, predicted = torch.max(outputs.data, 1)  # Get the predicted class
        total += labels.size(0)
        correct += (predicted == labels).sum().item()  # Count correct predictions

accuracy = 100 * correct / total
print(f'Accuracy of the model on the test set: {accuracy:.2f}%')


Accuracy of the model on the test set: 39.97%
