In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

# Keyword table for the rule-based genre tagger: genre label -> trigger keywords.
#
# * Entry order matters: assign_genre walks this dict in insertion order and
#   returns the first genre that has a matching keyword.
# * Keywords are stored lowercase because assign_genre lowercases the
#   description before comparing.
# * Each genre appears exactly once. A repeated key in a dict literal is not an
#   error; the later value silently replaces the earlier one, so the two former
#   'Horror' entries are merged here (keeping 'ghost', which used to be lost).
genre_keywords = {
    'Action': ['action', 'fight', 'battle', 'hero', 'adrenaline', 'rush'],
    'Comedy': ['comedy', 'funny', 'humor', 'laugh', 'hilarious'],
    'Horror': ['horror', 'scary', 'fear', 'ghost', 'terror', 'spooky', 'haunted', 'creepy'],
    'Drama': ['drama', 'emotional', 'heartbreaking', 'tragic'],
    'Romance': ['love', 'sweetheart', 'relationship', 'couple'],
    'Survival': ['disaster', 'calamity', 'storm', 'tornado', 'tsunami', 'earthquake'],
    'Science Fiction': ['science fiction', 'alien', 'future', 'space', 'technology'],
    'Fantasy': ['fantasy', 'magic', 'mythical', 'dragon', 'wizard'],
    'Mystery': ['mystery', 'detective', 'investigation', 'suspense', 'crime'],
    'Adventure': ['adventure', 'journey', 'explore', 'treasure', 'quest'],
    'Animation': ['animation', 'cartoon', 'animated', 'fantastic', 'colorful'],
    'Reality TV': ['reality tv', 'competition', 'contestant', 'elimination', 'drama', 'unscripted', 'challenge'],
    'Family': ['family', 'children', 'parenting', 'bonding', 'uplifting'],
    'War': ['war', 'military', 'combat', 'soldier', 'battlefield'],
    'Musical': ['musical', 'music', 'song', 'dance', 'performance'],
    'Western': ['western', 'cowboy', 'frontier', 'outlaw', 'gunfight', 'farm', 'country'],
    'Sports': ['sports', 'athlete', 'competition', 'victory', 'team'],
    'Superhero': ['superhero', 'superpowers', 'comic', 'vigilante', 'cape'],
    'Romantic Comedy': ['romantic comedy', 'rom-com', 'date', 'relationship', 'funny'],
    'Historical': ['historical', 'period', 'historical figure', 'costume', 'epoch'],
    'Noir': ['noir', 'dark', 'gritty', 'crime', 'mystery'],
    'Satire': ['satire', 'irony', 'mockery', 'parody', 'ridicule'],
    'Biopic': ['biopic', 'biographical', 'life story', 'portrayal', 'real person'],
    'Psychological': ['psychological', 'mind', 'psyche', 'mental', 'surreal'],
    'Musical Drama': ['musical drama', 'music', 'drama', 'song', 'performance'],
    'Political': ['political', 'politics', 'government', 'power', 'election'],
    'Fantasy Adventure': ['fantasy adventure', 'magic', 'adventure', 'quest', 'mythical'],
    'Spy': ['spy', 'espionage', 'intelligence', 'agent', 'secret mission'],
    'Teen': ['teen', 'high school', 'coming of age', 'teenager', 'youth'],
    'Road Trip': ['road trip', 'journey', 'travel', 'friendship', 'adventure'],
    'Courtroom Drama': ['courtroom drama', 'trial', 'lawyer', 'legal', 'justice'],
    'Space Opera': ['space opera', 'galaxy', 'alien', 'spacecraft', 'epic'],
    'Art House': ['art house', 'avant-garde', 'experimental', 'independent', 'auteur'],
    'Buddy Cop': ['buddy cop', 'partnership', 'police', 'comedy', 'crime'],
    'Heist': ['heist', 'robbery', 'criminal', 'plan', 'scheme'],
    'Biographical': ['biographical', 'life story', 'history', 'individual', 'journey'],
    'Narrative': ['narrative', 'story', 'plot', 'character', 'development'],
    'Martial Arts': ['martial arts', 'kung fu', 'karate', 'fighting', 'combat'],
}

# Function to assign genre based on keywords
def assign_genre(description, keywords_by_genre=None):
    """Return the first genre whose keyword occurs in ``description``.

    Matching is case-insensitive and a keyword must start at a word boundary,
    so 'war' matches "war" and "wars" but not "toward" or "reward". Suffixes
    are still allowed so plural and inflected forms keep matching.

    Args:
        description: Free-text description of a title. Anything that is not a
            string (for example a NaN produced by a missing CSV field) is
            treated as having no keywords.
        keywords_by_genre: Optional mapping of genre label -> iterable of
            keywords. Defaults to the module-level ``genre_keywords`` table.
            Genres are tried in the mapping's iteration order.

    Returns:
        The matching genre label, or 'Other' when nothing matches.
    """
    import re  # local import: keeps this definition self-contained within the cell

    if keywords_by_genre is None:
        keywords_by_genre = genre_keywords

    # Missing descriptions arrive as float NaN, which has no .lower().
    if not isinstance(description, str):
        return 'Other'

    # Lowercase once instead of once per keyword.
    text = description.lower()

    for genre, keywords in keywords_by_genre.items():
        for keyword in keywords:
            # (?<!\w) rejects hits that begin in the middle of another word.
            # The keyword is lowercased too, so mixed-case entries still match.
            pattern = r'(?<!\w)' + re.escape(keyword.lower())
            if re.search(pattern, text):
                return genre
    return 'Other'  # If no genre-specific keyword is found, assign as 'Other' genre

# Number of leading rows loaded from each data file (train and test alike).
N_ROWS = 200

# Read the first N_ROWS rows of the training data.
# Fields are separated by the literal text ' ::: '; a multi-character separator
# is why the python parsing engine is requested.
train_data = pd.read_csv('/train_data.txt', sep=' ::: ', engine='python', header=None,
                         names=['ID', 'TITLE', 'GENRE', 'DESCRIPTION'], nrows=N_ROWS)

# Read the first N_ROWS rows of the testing data (same layout, no GENRE column)
test_data = pd.read_csv('/test_data.txt', sep=' ::: ', engine='python', header=None,
                        names=['ID', 'TITLE', 'DESCRIPTION'], nrows=N_ROWS)

# Extract TF-IDF features from the training descriptions
# (English stop words removed, vocabulary capped at 1000 terms)
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X_train = tfidf_vectorizer.fit_transform(train_data['DESCRIPTION'])
y_train = train_data['GENRE']

# Train a Support Vector Machine (SVM) classifier with a linear kernel
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train, y_train)

# Project the test descriptions onto the vocabulary learned from the training data
X_test = tfidf_vectorizer.transform(test_data['DESCRIPTION'])

# SVM predictions for the test rows.
# NOTE(review): `predictions` is not used anywhere below. The PREDICTED_GENRE
# column that gets printed comes from the keyword heuristic, not from the SVM.
# Confirm which of the two is meant to be reported.
predictions = svm_classifier.predict(X_test)

# Tag every test description with the rule-based keyword matcher
test_data['PREDICTED_GENRE'] = test_data['DESCRIPTION'].apply(assign_genre)

# Print the results
print(test_data[['ID', 'TITLE', 'PREDICTED_GENRE']])




      ID                                          TITLE PREDICTED_GENRE
0      1                           Edgar's Lunch (1998)         Romance
1      2                       La guerra de papá (1977)          Family
2      3                    Off the Beaten Track (2010)      Reality TV
3      4                         Meu Amigo Hindu (2015)          Family
4      5                              Er nu zhai (1955)           Drama
..   ...                                            ...             ...
195  196  Foot Reflexology: A Step by Step Guide (2009)   Psychological
196  197   Moses: Fallen. In the City of Angels. (2005)          Action
197  198                      "The Startup Hour" (2015)          Action
198  199                      The Great Asteroid (2007)         Romance
199  200                                  Marlon (2008)          Comedy

[200 rows x 3 columns]


In [None]:
from google.colab import drive
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

# Mount Google Drive at /content/drive. The CSV written at the end of this cell
# is saved under /content/drive/MyDrive, so the mount has to happen first.
# `drive` comes from google.colab, so this cell is tied to a Colab runtime.
drive.mount('/content/drive')

# Keyword table for the rule-based genre tagger: genre label -> trigger keywords.
#
# * Entry order matters: assign_genre walks this dict in insertion order and
#   returns the first genre that has a matching keyword.
# * Keywords are stored lowercase because assign_genre lowercases the
#   description before comparing.
# * Each genre appears exactly once. A repeated key in a dict literal is not an
#   error; the later value silently replaces the earlier one, so the two former
#   'Horror' entries are merged here (keeping 'ghost', which used to be lost).
genre_keywords = {
    'Action': ['action', 'fight', 'battle', 'hero', 'adrenaline', 'rush'],
    'Comedy': ['comedy', 'funny', 'humor', 'laugh', 'hilarious'],
    'Horror': ['horror', 'scary', 'fear', 'ghost', 'terror', 'spooky', 'haunted', 'creepy'],
    'Drama': ['drama', 'emotional', 'heartbreaking', 'tragic'],
    'Romance': ['love', 'sweetheart', 'relationship', 'couple'],
    'Survival': ['disaster', 'calamity', 'storm', 'tornado', 'tsunami', 'earthquake'],
    'Science Fiction': ['science fiction', 'alien', 'future', 'space', 'technology'],
    'Fantasy': ['fantasy', 'magic', 'mythical', 'dragon', 'wizard'],
    'Mystery': ['mystery', 'detective', 'investigation', 'suspense', 'crime'],
    'Adventure': ['adventure', 'journey', 'explore', 'treasure', 'quest'],
    'Animation': ['animation', 'cartoon', 'animated', 'fantastic', 'colorful'],
    'Reality TV': ['reality tv', 'competition', 'contestant', 'elimination', 'drama', 'unscripted', 'challenge'],
    'Family': ['family', 'children', 'parenting', 'bonding', 'uplifting'],
    'War': ['war', 'military', 'combat', 'soldier', 'battlefield'],
    'Musical': ['musical', 'music', 'song', 'dance', 'performance'],
    'Western': ['western', 'cowboy', 'frontier', 'outlaw', 'gunfight', 'farm', 'country'],
    'Sports': ['sports', 'athlete', 'competition', 'victory', 'team'],
    'Superhero': ['superhero', 'superpowers', 'comic', 'vigilante', 'cape'],
    'Romantic Comedy': ['romantic comedy', 'rom-com', 'date', 'relationship', 'funny'],
    'Historical': ['historical', 'period', 'historical figure', 'costume', 'epoch'],
    'Noir': ['noir', 'dark', 'gritty', 'crime', 'mystery'],
    'Satire': ['satire', 'irony', 'mockery', 'parody', 'ridicule'],
    'Biopic': ['biopic', 'biographical', 'life story', 'portrayal', 'real person'],
    'Psychological': ['psychological', 'mind', 'psyche', 'mental', 'surreal'],
    'Musical Drama': ['musical drama', 'music', 'drama', 'song', 'performance'],
    'Political': ['political', 'politics', 'government', 'power', 'election'],
    'Fantasy Adventure': ['fantasy adventure', 'magic', 'adventure', 'quest', 'mythical'],
    'Spy': ['spy', 'espionage', 'intelligence', 'agent', 'secret mission'],
    'Teen': ['teen', 'high school', 'coming of age', 'teenager', 'youth'],
    'Road Trip': ['road trip', 'journey', 'travel', 'friendship', 'adventure'],
    'Courtroom Drama': ['courtroom drama', 'trial', 'lawyer', 'legal', 'justice'],
    'Space Opera': ['space opera', 'galaxy', 'alien', 'spacecraft', 'epic'],
    'Art House': ['art house', 'avant-garde', 'experimental', 'independent', 'auteur'],
    'Buddy Cop': ['buddy cop', 'partnership', 'police', 'comedy', 'crime'],
    'Heist': ['heist', 'robbery', 'criminal', 'plan', 'scheme'],
    'Biographical': ['biographical', 'life story', 'history', 'individual', 'journey'],
    'Narrative': ['narrative', 'story', 'plot', 'character', 'development'],
    'Martial Arts': ['martial arts', 'kung fu', 'karate', 'fighting', 'combat'],
}

# Function to assign genre based on keywords
def assign_genre(description, keywords_by_genre=None):
    """Return the first genre whose keyword occurs in ``description``.

    Matching is case-insensitive and a keyword must start at a word boundary,
    so 'war' matches "war" and "wars" but not "toward" or "reward". Suffixes
    are still allowed so plural and inflected forms keep matching.

    Args:
        description: Free-text description of a title. Anything that is not a
            string (for example a NaN produced by a missing CSV field) is
            treated as having no keywords.
        keywords_by_genre: Optional mapping of genre label -> iterable of
            keywords. Defaults to the module-level ``genre_keywords`` table.
            Genres are tried in the mapping's iteration order.

    Returns:
        The matching genre label, or 'Other' when nothing matches.
    """
    import re  # local import: keeps this definition self-contained within the cell

    if keywords_by_genre is None:
        keywords_by_genre = genre_keywords

    # Missing descriptions arrive as float NaN, which has no .lower().
    if not isinstance(description, str):
        return 'Other'

    # Lowercase once instead of once per keyword.
    text = description.lower()

    for genre, keywords in keywords_by_genre.items():
        for keyword in keywords:
            # (?<!\w) rejects hits that begin in the middle of another word.
            # The keyword is lowercased too, so mixed-case entries still match.
            pattern = r'(?<!\w)' + re.escape(keyword.lower())
            if re.search(pattern, text):
                return genre
    return 'Other'  # If no genre-specific keyword is found, assign as 'Other' genre

# Number of leading rows loaded from each data file and written to the export.
N_ROWS = 200

# Read the first N_ROWS rows of the training data.
# Fields are separated by the literal text ' ::: '; a multi-character separator
# is why the python parsing engine is requested.
train_data = pd.read_csv('/train_data.txt', sep=' ::: ', engine='python', header=None,
                         names=['ID', 'TITLE', 'GENRE', 'DESCRIPTION'], nrows=N_ROWS)

# Read the first N_ROWS rows of the testing data (same layout, no GENRE column).
# NOTE(review): test_data is loaded but not used again in this cell.
test_data = pd.read_csv('/test_data.txt', sep=' ::: ', engine='python', header=None,
                        names=['ID', 'TITLE', 'DESCRIPTION'], nrows=N_ROWS)

# Extract TF-IDF features from the training descriptions
# (English stop words removed, vocabulary capped at 1000 terms)
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X_train = tfidf_vectorizer.fit_transform(train_data['DESCRIPTION'])
y_train = train_data['GENRE']

# Train a Support Vector Machine (SVM) classifier with a linear kernel
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train, y_train)

# Predict on the same rows the classifier was just fitted on. These are
# in-sample predictions, not an evaluation on held-out data.
train_predictions = svm_classifier.predict(X_train)

# Add predicted genres to the training data
train_data['PREDICTED_GENRE'] = train_predictions

# Save the loaded training rows (at most N_ROWS) to a CSV file in Google Drive.
# The "200" in the file name is a fixed label and does not follow N_ROWS.
train_data.head(N_ROWS).to_csv('/content/drive/MyDrive/200FINAL_training_data.csv', index=False)

print("CSV file saved successfully.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
CSV file saved successfully.
