## Remove - Unnecessary Tags - using TF-IDF Approach

In [24]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Data Access
url = "/Users/abhishekjaiswal/Downloads/combined_movie_genres_tags.csv"
df = pd.read_csv(url)

# Step 2: Data Preprocessing
df['tagsWithName'] = df['tagsWithName'].apply(eval)

# Step 3: Calculate TF-IDF vectors for tags
tag_strings = df['tagsWithName'].apply(lambda x: ' '.join(x))

# Initialize a TF-IDF vectorizer
vectorizer = TfidfVectorizer()
# Fit and transform the tag strings to obtain TF-IDF vectors
tfidf_matrix = vectorizer.fit_transform(tag_strings)

# Step 4: Calculate cosine similarity matrix
cos_sim_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Step 5: Remove tags with low semantic similarity
threshold = 0.2  # Adjust the threshold as needed

# Iterate over the indices where similarity is below threshold and remove tags
for i in range(len(df)):
    for j in range(i + 1, len(df)):  # Avoid comparing the same movie
        if cos_sim_matrix[i, j] < threshold:
            for tag in df['tagsWithName'].iloc[i]:
                if tag in df['tagsWithName'].iloc[j]:
                    df['tagsWithName'].iloc[j].remove(tag)

# Step 6: Data Saving or Further Analysis
print(df.head())

# Save - to a new CSV file
df.to_csv("/Users/abhishekjaiswal/Downloads/cleaned_movie_data.csv", index=False)

   movieID                                              genre  \
0        1  ['Adventure', 'Animation', 'Children', 'Comedy...   
1        2               ['Adventure', 'Children', 'Fantasy']   
2        3                              ['Comedy', 'Romance']   
3        4                     ['Comedy', 'Drama', 'Romance']   
4        5                                         ['Comedy']   

                                          tagsWithID  \
0  [896, 900, 1925, 10376, 1290, 13, 528, 6675, 1...   
1  [14371, 742, 488, 2057, 1224, 2732, 13, 10382,...   
2               [13668, 2724, 3242, 7444, 380, 3196]   
3                                                 []   
4                 [2724, 1646, 752, 4946, 4953, 125]   

                                        tagsWithName  
0  [tumeys vhs, witty, want, rousing, almost favo...  
1  [joe johnston, game, lebbat, jungle, monkey, k...  
2  [old, grun running, jack lemmon, funniest movi...  
3                                                 []

## Remove -  Tags Below a Frequency Level

In [25]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

# Step 1: Data Retrieval
cleaned_data = "/Users/abhishekjaiswal/Downloads/cleaned_movie_data.csv"
df1 = pd.read_csv(cleaned_data)

# Step 2: Data Preprocessing
df1['tagsWithName'] = df1['tagsWithName'].apply(eval)

# Step 3: Analysis (Identify unnecessary tags)
all_tags = [tag for sublist in df['tagsWithName'] for tag in sublist]
tag_freq = pd.Series(all_tags).value_counts()

# Step 4: Filtering (Remove unnecessary tags)
unnecessary_tags = tag_freq[tag_freq < 10].index

def filter_tags(tag_list):
    return [tag for tag in tag_list if tag not in unnecessary_tags]

df1['tagsWithName'] = df1['tagsWithName'].apply(filter_tags)

# Step 5: Removing [] from tagswithID and tagswithName
df1 = df1[~((df1['tagsWithID'] == '[]') | (df1['tagsWithName'] == '[]'))]

# Step 5: Data Saving or Further Analysis
print(df1.head())

# Save - to a new CSV file
# df1.to_csv("/Users/abhishekjaiswal/Downloads/cleaned_movie_data1.csv", index=False)

   movieID                                              genre  \
0        1  ['Adventure', 'Animation', 'Children', 'Comedy...   
1        2               ['Adventure', 'Children', 'Fantasy']   
2        3                              ['Comedy', 'Romance']   
4        5                                         ['Comedy']   
5        6                    ['Action', 'Crime', 'Thriller']   

                                          tagsWithID  \
0  [896, 900, 1925, 10376, 1290, 13, 528, 6675, 1...   
1  [14371, 742, 488, 2057, 1224, 2732, 13, 10382,...   
2               [13668, 2724, 3242, 7444, 380, 3196]   
4                 [2724, 1646, 752, 4946, 4953, 125]   
5  [3008, 675, 13128, 2538, 587, 7052, 6988, 528,...   

                                        tagsWithName  
0  [disney animated feature, animation, disney, w...  
1                                                 []  
2                                                 []  
4                                           [sequel]


## Check Spelling - using SpellChecker

In [26]:
from spellchecker import SpellChecker

# Load your DataFrame
#df2 = pd.read_csv("/Users/abhishekjaiswal/Downloads/your_data.csv")

# Initialize spell checker
spell_checker = SpellChecker()

# Function to correct misspelled words
def correct_spelling(tag):
    corrected_tag = spell_checker.correction(tag)
    if corrected_tag is not None:
        corrected_tag = corrected_tag.lower()  # Convert to lowercase if correction is not None
    return corrected_tag if corrected_tag else tag  # Return original tag if no correction found

# Apply the correction function to each tag
df1['tagsWithName'] = df1['tagsWithName'].apply(lambda x: [correct_spelling(tag) for tag in x])

# Optional: Remove duplicates within each list in 'tagsWithName'
df1['tagsWithName'] = df1['tagsWithName'].apply(lambda x: list(set(x)))

df1.to_csv("/Users/abhishekjaiswal/Downloads/moreAccurate_data.csv", index=False)

## Remove - Spaces between Words

In [8]:
import nltk
from nltk.tokenize import word_tokenize

# Tokenize text into words
def word_segment(text):
    return word_tokenize(str(text))  # Convert to string

# Apply word segmentation to each tag
df1_exploded['tagsWithName'] = df1_exploded['tagsWithName'].apply(word_segment)

# Group by the original index and aggregate the lists back into one
df1_grouped = df1_exploded.groupby(df1_exploded.index).agg(list)

# Now, df1_grouped contains the segmented words in the 'tagsWithName' column
df1_grouped.to_csv("/Users/abhishekjaiswal/Downloads/moreAccurate_data.csv", index=False)

## Removing - [ ] 

In [28]:
import csv

# Define the input file path
file_path = '/Users/abhishekjaiswal/Downloads/moreAccurate_data.csv'

# Read data from the CSV file
data = []
with open(file_path, newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        # Convert string representation of lists to actual lists
        row['genre'] = eval(row['genre'])
        row['tagsWithID'] = eval(row['tagsWithID'])
        row['tagsWithName'] = eval(row['tagsWithName'])
        data.append(row)

# Filter out rows with empty values in tagsWithName
filtered_data = [row for row in data if row['tagsWithName']]


df1_grouped.to_csv("/Users/abhishekjaiswal/Downloads/moreAccurate_data.csv", index=False)

{'movieID': '1', 'genre': ['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy'], 'tagsWithID': [896, 900, 1925, 10376, 1290, 13, 528, 6675, 15123, 7705, 25, 672, 5289, 6323, 10420, 819, 10422, 55, 4924, 60, 15170, 15045, 326, 2119, 459, 11344, 465, 11477, 605, 14562, 362, 752, 4339, 501, 4342, 2293, 382], 'tagsWithName': ['dinner', 'animation', 'want to see again', 'disney animated feature']}
{'movieID': '5', 'genre': ['Comedy'], 'tagsWithID': [2724, 1646, 752, 4946, 4953, 125], 'tagsWithName': ['sequel']}
{'movieID': '10', 'genre': ['Action', 'Adventure', 'Thriller'], 'tagsWithID': [161, 485, 74, 459, 1515, 110, 6800, 657, 977, 5843, 9015, 9816, 634, 125, 1023], 'tagsWithName': ['murder', 'james bond', 'bond', 'assassin', '007 (series)', 'killer as protagonist']}
{'movieID': '11', 'genre': ['Comedy', 'Drama', 'Romance'], 'tagsWithID': [6401, 587, 651, 780, 689, 892, 10494], 'tagsWithName': ['seen', 'seen more than once']}
{'movieID': '12', 'genre': ['Comedy', 'Horror'], 'tagsWit

In [29]:
import csv

input_file_path = '/Users/abhishekjaiswal/Downloads/moreAccurate_data.csv'
output_file_path = '/Users/abhishekjaiswal/Downloads/filtered_data.csv'

data = []
with open(input_file_path, newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        # Convert string representation of lists to actual lists
        row['genre'] = eval(row['genre'])
        row['tagsWithID'] = eval(row['tagsWithID'])
        row['tagsWithName'] = eval(row['tagsWithName'])
        data.append(row)

# Filter out rows with empty values in tagsWithName
filtered_data = [row for row in data if row['tagsWithName']]

# Filtered data to a new CSV file
fieldnames = ['movieID', 'genre', 'tagsWithID', 'tagsWithName']
with open(output_file_path, 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    
    # Write header
    writer.writeheader()
    
    # Write filtered rows
    for row in filtered_data:
        writer.writerow(row)

print("Filtered data saved to:", output_file_path)


Filtered data saved to: /Users/abhishekjaiswal/Downloads/filtered_data.csv
