In [1]:
#importing relevant python libraries

import nltk
nltk.download('wordnet')
from nltk import sent_tokenize, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

import numpy as np
import pandas as pd

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
#Loading the tags csv file into a dataframe
tags = pd.read_csv('tags.csv')
tags

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200
...,...,...,...,...
3678,606,7382,for katie,1171234019
3679,606,7936,austere,1173392334
3680,610,3265,gun fu,1493843984
3681,610,3265,heroic bloodshed,1493843978


In [3]:
#Taking all the tags and creating a list
tag_list = list(tags['tag'])
len(tag_list)

3683

In [4]:
#converting all the tags into lower case 
for i in range(len(tag_list)):
    tag_list[i] = tag_list[i].lower()
tag_list

['funny',
 'highly quotable',
 'will ferrell',
 'boxing story',
 'mma',
 'tom hardy',
 'drugs',
 'leonardo dicaprio',
 'martin scorsese',
 'way too long',
 'al pacino',
 'gangster',
 'mafia',
 'al pacino',
 'mafia',
 'holocaust',
 'true story',
 'twist ending',
 'anthony hopkins',
 'courtroom drama',
 'twist ending',
 'britpop',
 'indie record label',
 'music',
 'dumpster diving',
 'sustainability',
 'romantic comedy',
 'wedding',
 'painter',
 'bloody',
 'black hole',
 'sci-fi',
 'time-travel',
 'fantasy',
 'magic board game',
 'robin williams',
 'beautiful scenery',
 'epic',
 'historical',
 'inspirational',
 'medieval',
 'mel gibson',
 'oscar (best cinematography)',
 'revenge',
 'sword fight',
 'black comedy',
 'christina ricci',
 'christopher lloyd',
 'dark comedy',
 'family',
 'gothic',
 'al pacino',
 'andy garcia',
 'classic',
 'francis ford coppola',
 'mafia',
 'black comedy',
 'christina ricci',
 'christopher lloyd',
 'family',
 'gothic',
 'quirky',
 'family',
 'funny',
 'macaula

In [5]:
#Using the WordNet Lemmatizer to find out number of unique tags
#Creating a dictionary to map the different variations of the same tag against it's unique name
lemmatizer = WordNetLemmatizer()
f=0

tag_lemmatized = {}
tag_lemmatized_list = []
for x in tag_list:
    lem_word = lemmatizer.lemmatize(x)
    tag_lemmatized_list.append(lem_word)
    if lem_word not in tag_lemmatized :
        tag_lemmatized[lem_word]=[x]
    else :
        tag_lemmatized[lem_word].append(x)

len(tag_lemmatized)

1459

In [6]:
#Storing the unique tag names in a separate list
unique_tag = []
unique_tags = tag_lemmatized.keys()
len(unique_tags)

1459

In [7]:
#Creating a unique tags dataframe and giving each unique tag a unique id
tag_id = []
tag_id = range(1,1460)
unique_tags_df = pd.DataFrame(unique_tags,columns=['tags'])
unique_tags_df['tag_id'] = tag_id
unique_tags_df = unique_tags_df[['tag_id', 'tags']]
unique_tags_df

Unnamed: 0,tag_id,tags
0,1,funny
1,2,highly quotable
2,3,will ferrell
3,4,boxing story
4,5,mma
...,...,...
1454,1455,70mm
1455,1456,for katie
1456,1457,austere
1457,1458,gun fu


In [8]:
#Storing the unique tags dataframe into a csv file
unique_tags_df.to_csv('Genome-tags.csv',index=False)

In [9]:
#Replacing the original tags in the csv file with the lemmatized ones
#This removes any discrepancies for later on
tags['tags'] = pd.Series(tag_lemmatized_list)
tags = tags.drop(['tag'],axis=1)
tags.head(10)

Unnamed: 0,userId,movieId,timestamp,tags
0,2,60756,1445714994,funny
1,2,60756,1445714996,highly quotable
2,2,60756,1445714992,will ferrell
3,2,89774,1445715207,boxing story
4,2,89774,1445715200,mma
5,2,89774,1445715205,tom hardy
6,2,106782,1445715054,drug
7,2,106782,1445715051,leonardo dicaprio
8,2,106782,1445715056,martin scorsese
9,7,48516,1169687325,way too long


In [10]:
#Merging the tags dataframe with the unique tags dataframe to get the corresponding tag ids
tags_with_ids = pd.merge(tags,unique_tags_df,on='tags')
tags_with_ids

Unnamed: 0,userId,movieId,timestamp,tags,tag_id
0,2,60756,1445714994,funny,1
1,62,2953,1525636885,funny,1
2,62,3114,1525636913,funny,1
3,62,60756,1528934381,funny,1
4,62,68848,1527274322,funny,1
...,...,...,...,...,...
3678,606,7382,1171234019,for katie,1456
3679,606,7936,1173392334,austere,1457
3680,610,3265,1493843984,gun fu,1458
3681,610,3265,1493843978,heroic bloodshed,1459


In [12]:
#Dropping the timestamp column
tags_with_ids = tags_with_ids.drop(['timestamp'], axis=1)
tags_with_ids

Unnamed: 0,userId,movieId,tags,tag_id
0,2,60756,funny,1
1,62,2953,funny,1
2,62,3114,funny,1
3,62,60756,funny,1
4,62,68848,funny,1
...,...,...,...,...
3678,606,7382,for katie,1456
3679,606,7936,austere,1457
3680,610,3265,gun fu,1458
3681,610,3265,heroic bloodshed,1459


In [13]:
#Dropping the tag names
tags_with_ids = tags_with_ids.drop(['tags'], axis=1)
tags_with_ids

Unnamed: 0,userId,movieId,tag_id
0,2,60756,1
1,62,2953,1
2,62,3114,1
3,62,60756,1
4,62,68848,1
...,...,...,...
3678,606,7382,1456
3679,606,7936,1457
3680,610,3265,1458
3681,610,3265,1459


In [14]:
#Storing the tags with ids dataframe into a csv file
tags_with_ids.to_csv('Tags_IDs.csv', index=False)