In [None]:
#IMDB and Bibsonomy Dataset generation for Team Formation
#Karan Vombatkere
#Spring 2022

#Imports
import random, json, time
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

## IMDB Datasets

### Extract and pre-process IMDB data by movie year

In [None]:
#Function to run IMDB data generation process
def extractIMDBDataByYear(movieYear,
                          imdb_fp='/usr3/graduate/kvombat/Team-Formation/Datasets/imdb_datasets/raw_data/'):
    """Build a per-person genre table from the raw IMDB TSV files.

    Reads ``title_principals.tsv``, ``title_basics.tsv`` and ``name_basics.tsv``
    from ``imdb_fp``, keeps directors/actors/actresses credited on movies whose
    ``startYear >= movieYear``, and attaches to every person the de-duplicated,
    lower-cased genres of those movies. Only people with no numeric death year
    and a birth year >= 1950 are kept.

    Args:
        movieYear (int): minimum movie start year (inclusive).
        imdb_fp (str): directory holding the three raw TSV files. Defaults to
            the original hard-coded location.

    Returns:
        tuple: ``(genres_names_df, genreCounts, genre_id_dict)`` where
            * ``genres_names_df`` has columns ``category`` ('actor' or
              'director'), ``allGenres`` (comma-joined string),
              ``primaryName`` and ``numGenres``, sorted by ``category``;
            * ``genreCounts`` maps genre -> number of rows listing it;
            * ``genre_id_dict`` maps genre -> integer id, numbered in the
              order genres are first seen in ``genres_names_df``.
    """
    import os  # local import so this cell does not depend on an edit to the imports cell

    startTime = time.perf_counter()

    #1. Read in principals data (only the columns used below, as strings)
    principals_df = pd.read_csv(os.path.join(imdb_fp, 'title_principals.tsv'), sep='\t', header=0,
                                usecols=['tconst', 'nconst', 'category'], dtype=str)
    principals_ss = principals_df[principals_df['category'].isin(['director', 'actor', 'actress'])]
    principals_ss = principals_ss[['tconst', 'nconst', 'category']]

    #2. Read in movie titles
    titles_df = pd.read_csv(os.path.join(imdb_fp, 'title_basics.tsv'), sep='\t', header=0,
                            usecols=['tconst', 'titleType', 'primaryTitle', 'startYear', 'genres'],
                            dtype=str)

    #Keep only movies with valid genres and a start year >= movieYear.
    #to_numeric(errors='coerce') turns the '\N' placeholder (or any other
    #non-numeric value) into NaN, which then fails the >= comparison.
    start_year = pd.to_numeric(titles_df.startYear, errors='coerce')
    is_recent_movie = ((titles_df.titleType == 'movie')
                       & titles_df.genres.notna() & (titles_df.genres != '\\N')
                       & (start_year >= movieYear))

    #filter to columns needed, and calculate number of genres
    movies_df = titles_df.loc[is_recent_movie,
                              ['tconst', 'primaryTitle', 'startYear', 'genres']].reset_index(drop=True)
    movies_df['numGenres'] = movies_df.genres.apply(lambda x: len(x.split(',')))

    #3. Merge principals and movies
    principal_title_joined = principals_ss.merge(movies_df, on='tconst')
    print("Merged principals, movies for movieYear >= {}, principal_title_joined df shape = {}"
          .format(movieYear, principal_title_joined.shape))

    #4. Get Genres for each principal grouped and aggregated
    def _merge_genres(genres):
        # sorted() instead of a bare set(): string hashing is randomized per
        # process, so set order (and the genre ids derived from it further
        # down) would otherwise change from run to run.
        unique_genres = set(','.join(genres.astype(str)).lower().split(','))
        return ','.join(sorted(unique_genres))

    genres_grouped = (principal_title_joined.groupby('nconst')['genres']
                      .apply(_merge_genres)
                      .reset_index()
                      .rename(columns={'genres': 'allGenres'}))

    #One row per person; someone credited in several categories keeps the
    #category of their first merged row.
    principal_genres_df = (principal_title_joined[['nconst', 'category']]
                           .merge(genres_grouped, on='nconst')
                           .drop_duplicates(subset=['nconst'])
                           .reset_index(drop=True))
    print("Merged principals & grouped genres, principal_genres_df shape = {}".format(principal_genres_df.shape))

    #5. Add principal names and subset to living people born in 1950 or later
    names_df = pd.read_csv(os.path.join(imdb_fp, 'name_basics.tsv'), sep='\t', header=0,
                           usecols=['nconst', 'primaryName', 'birthYear', 'deathYear'], dtype=str)
    birth_year = pd.to_numeric(names_df.birthYear, errors='coerce')
    death_year = pd.to_numeric(names_df.deathYear, errors='coerce')
    names_recent = names_df.loc[death_year.isna() & (birth_year >= 1950), ['nconst', 'primaryName']]

    genres_names_df = principal_genres_df.merge(names_recent, on='nconst').drop(columns=['nconst'])

    #Fold actresses into the 'actor' category. A single .loc call is used
    #because chained df['category'].loc[...] = ... may write to a temporary.
    genres_names_df.loc[genres_names_df['category'] == 'actress', 'category'] = 'actor'

    # Get count of genres
    genres_names_df['numGenres'] = genres_names_df.allGenres.apply(lambda x: len(x.split(',')))
    genres_names_df = genres_names_df.sort_values(by=['category']).reset_index(drop=True)

    print("Added Names and Genre counts, genres_names_df shape = {}"
          .format(genres_names_df.shape))

    #6. Extract total counts of genre words (one count per person listing the genre)
    genreCounts = {}
    for genre_string in genres_names_df.allGenres:
        for w in genre_string.split(','):
            genreCounts[w] = genreCounts.get(w, 0) + 1

    # Create Genre ID Dict
    genre_id_dict = {genre_name: indx for indx, genre_name in enumerate(genreCounts)}

    print("Completed pre-processing IMDB dataset for movieYear >=", movieYear)

    runTime = time.perf_counter() - startTime
    print("IMDB data generation run time = {:.1f} seconds".format(runTime))

    return genres_names_df, genreCounts, genre_id_dict
    
    

In [None]:
#Run extraction and pre-processing code
# Minimum movie start year (inclusive); also embedded in the output file names when the lists are saved
movie_year_imdb = 2020

# genres_names: per-person genre table, counts_dict: genre -> count, id_dict: genre -> integer id.
# id_dict and genres_names are read as globals by the skill-list functions defined below.
genres_names, counts_dict, id_dict = extractIMDBDataByYear(movie_year_imdb)

In [None]:
# Display the genre -> count mapping returned by extractIMDBDataByYear
counts_dict

### Generate expert (directors) and task (actors) skills lists

In [None]:
def extract_skills(genre_string, skill_id_map=None):
    """Translate a comma-separated genre string into a list of skill-id strings.

    Args:
        genre_string (str): genres joined by ',' (e.g. "comedy,drama").
        skill_id_map (dict, optional): genre -> integer id. When omitted, the
            notebook-level ``id_dict`` is used, which matches the original
            single-argument behaviour.

    Returns:
        list[str]: ``str(id)`` for each genre, in the order given.

    Raises:
        KeyError: if a genre is missing from the mapping.
    """
    if skill_id_map is None:
        # Resolved at call time, so the cell that builds id_dict must have run.
        skill_id_map = id_dict
    return [str(skill_id_map[skill]) for skill in genre_string.split(',')]


#Skills list for directors_df
def create_expert_skills_list():
    """Return one list of skill-id strings per director in ``genres_names``.

    Reads the notebook-level ``genres_names`` table and ``extract_skills``.
    Prints summary statistics of ``numGenres`` for directors and draws its
    histogram as a side effect.

    NOTE: a later cell of this notebook defines another function with this
    same name for the Bibsonomy data; re-run this cell before calling it if
    that later cell has already been executed.
    """
    # reset_index returns a fresh frame, so nothing is written into a filtered
    # slice of genres_names (the source of SettingWithCopyWarning).
    directors_df = genres_names[genres_names['category'] == 'director'].reset_index(drop=True)

    print("Directors skills distribution:", directors_df.numGenres.describe())
    directors_df.hist(column=['numGenres'], bins=8, figsize=(8,5))

    # Build the list directly; no temporary column is needed on the local frame.
    expert_skills_list = directors_df.allGenres.apply(lambda x: extract_skills(x)).to_list()

    print("Successfully generated experts skills list:", len(expert_skills_list))
    return expert_skills_list


#Skills list for actors_df
def create_tasks_skills_list():
    """Return one list of skill-id strings per actor in ``genres_names``.

    Reads the notebook-level ``genres_names`` table and ``extract_skills``.
    Prints summary statistics of ``numGenres`` for actors and draws its
    histogram as a side effect.

    NOTE: a later cell of this notebook defines another function with this
    same name for the Bibsonomy data; re-run this cell before calling it if
    that later cell has already been executed.
    """
    # reset_index returns a fresh frame, so nothing is written into a filtered
    # slice of genres_names (the source of SettingWithCopyWarning).
    actors_df = genres_names[genres_names['category'] == 'actor'].reset_index(drop=True)

    print("\nActors skills distribution:", actors_df.numGenres.describe())
    actors_df.hist(column=['numGenres'], bins=8, figsize=(8,5))

    # Build the list directly; no temporary column is needed on the local frame.
    tasks_skills_list = actors_df.allGenres.apply(lambda x: extract_skills(x)).to_list()

    print("Successfully generated tasks skills list:", len(tasks_skills_list))
    return tasks_skills_list

In [None]:
# Build the IMDB skill lists: directors become experts, actors become tasks.
# Both names are reused further down for the Bibsonomy lists.
expSkills = create_expert_skills_list()
taskSkills = create_tasks_skills_list()

In [None]:
#Save to disk
# Each skills list is written as a single JSON document; the movie year is part of the file name.
imdb_outpath = '/usr3/graduate/kvombat/Team-Formation/Datasets/imdb_datasets/'

experts_filename = 'imdb_experts_'+ str(movie_year_imdb) + '.txt'
with open(imdb_outpath + experts_filename, 'w') as f:
    json.dump(expSkills, f)

tasks_filename = 'imdb_tasks_' + str(movie_year_imdb) + '.txt'
with open(imdb_outpath + tasks_filename, 'w') as f:
    json.dump(taskSkills, f)

### Read in final IMDB data from txt files

In [None]:
# Load the experts/tasks skill lists back from their JSON text files and report their sizes.
# NOTE(review): these file names carry no year suffix, whereas the save cell writes
# 'imdb_experts_<year>.txt' / 'imdb_tasks_<year>.txt' - confirm which files are meant.
read_path = '/usr3/graduate/kvombat/Team-Formation/Datasets/imdb_datasets/'

with open(read_path + 'imdb_experts.txt', 'r') as f:
    expert_skills_list = json.load(f)

with open(read_path + 'imdb_tasks.txt', 'r') as f:
    task_skills_list = json.load(f)

print("Num Experts={}, Num Tasks={}".format(len(expert_skills_list), len(task_skills_list)))

## Bibsonomy Datasets

### Extract and pre-process Bibsonomy data by paper year

In [None]:
from nltk.corpus import stopwords
# English stop words, used by filter_tags to reject tags
stop_words = set(stopwords.words('english'))

bibsonomy_fp = '/usr3/graduate/kvombat/Team-Formation/Datasets/bibsonomy_datasets/'
# Context manager so the word-list handle is closed once the text has been read
with open(bibsonomy_fp+"english_wordlist.txt") as wordlist_file:
    english_words = wordlist_file.read()
# One vocabulary entry per line of the word list. Stored as a set because
# filter_tags tests every tag for membership; on a list each test is a linear scan.
# An empty entry produced by a blank or trailing line is kept, as before.
word_vocab = set(english_words.split('\n'))

def filter_tags(tags_string, vocab=None, stopword_set=None):
    """Keep only the tags that are vocabulary words and not stop words.

    Args:
        tags_string (str): tags joined by ','.
        vocab (container, optional): accepted words. Defaults to the
            notebook-level ``word_vocab``.
        stopword_set (container, optional): rejected words. Defaults to the
            notebook-level ``stop_words``.

    Returns:
        str: the surviving tags, in their original order, joined by ','
        (an empty string when none survive).
    """
    # Defaults are resolved at call time so single-argument calls behave as before.
    if vocab is None:
        vocab = word_vocab
    if stopword_set is None:
        stopword_set = stop_words

    return ",".join(tag for tag in tags_string.split(",")
                    if tag in vocab and tag not in stopword_set)

def trim_string(s):
    """Remove one leading comma from ``s`` if it has one.

    Unlike indexing ``s[0]``, ``startswith`` also copes with an empty string,
    which is returned unchanged.
    """
    if s.startswith(","):
        return s[1:]
    return s

In [None]:
#Function to run Bibsonomy data generation process
def extractBibsonomyDataByYear(paperYear,
                               bibsonomy_fp='/usr3/graduate/kvombat/Team-Formation/Datasets/bibsonomy_datasets/'):
    """Build a per-author tag table from the Bibsonomy ``tas`` and ``bibtex`` dumps.

    Tag assignments with ``content_type == 2`` are joined to bibtex entries
    whose year is >= ``paperYear``; every author of an entry (the author field
    is split on ' and ') receives the entry's tags. Tags are lower-cased,
    de-duplicated, cleaned with the notebook-level ``filter_tags`` and
    ``trim_string``, and finally restricted to a vocabulary of frequent tags.

    Args:
        paperYear (int): minimum publication year (inclusive).
        bibsonomy_fp (str): directory holding the ``tas`` and ``bibtex`` files.
            Defaults to the original hard-coded location.

    Returns:
        tuple: ``(tags_grouped_clean, tags_vocab_list)`` where
            * ``tags_grouped_clean`` has columns ``author``, ``allTags``
              (comma-joined string) and ``numTags``, limited to authors with
              3 to 100 vocabulary tags;
            * ``tags_vocab_list`` is the tag vocabulary, most frequent first.
    """
    import os  # local import so this cell does not depend on an edit to the imports cell

    startTime = time.perf_counter()

    #1. Import tas data and filter to bibtex
    # NOTE(review): no header= argument is passed, so pandas treats the first
    # line of each dump as a header row before the names below replace it -
    # confirm the dumps really start with a header line.
    tas_df = pd.read_table(os.path.join(bibsonomy_fp, 'tas'), sep='\t', usecols=[1,2,3])
    tas_df.columns = ['tag', 'content_id', 'content_type']
    tas_df = tas_df[tas_df.content_type == 2].reset_index(drop=True)

    print("Imported tas data:", tas_df.shape)

    #2. Import bibtex data
    bibtex_path = os.path.join(bibsonomy_fp, 'bibtex')
    bibtex_kwargs = dict(sep='\t', usecols=[0,32,34], encoding='utf-8', engine='python')
    try:
        # on_bad_lines='skip' drops malformed rows silently; it replaces the
        # error_bad_lines/warn_bad_lines pair, which pandas 2.0 removed.
        bibtex_df = pd.read_table(bibtex_path, on_bad_lines='skip', **bibtex_kwargs)
    except TypeError:
        # pandas older than 1.3 does not know on_bad_lines
        bibtex_df = pd.read_table(bibtex_path, error_bad_lines=False, warn_bad_lines=False,
                                  **bibtex_kwargs)
    bibtex_df.columns = ['content_id','author','year']

    print("Imported bibtex data:", bibtex_df.shape)

    #Filter NaN data for content_id, author and year
    bibtex_df = bibtex_df.dropna(subset=['content_id', 'author', 'year'])

    #Filter for recent paperYear years (.copy() so the cast is not written into a slice)
    bibtex_df = bibtex_df[bibtex_df.year.str.isdigit()].copy()
    bibtex_df['year'] = bibtex_df.year.astype(int)
    bibtex_df = bibtex_df[bibtex_df.year >= paperYear]

    #Keep only valid content id
    bibtex_df = bibtex_df[bibtex_df.content_id.str.isdigit()].copy()
    bibtex_df['content_id'] = bibtex_df.content_id.astype(int)

    bibtex_df = bibtex_df.reset_index(drop=True)
    print("Filtered bibtex data to paperYear >= {}, bibtex_df shape: {}".format(paperYear, bibtex_df.shape))

    #3. Group and aggregate tags by author
    # One row per (author, content_id). explode() yields the same pairs as
    # building a wide frame of split names and stacking it, without allocating
    # one column per co-author position.
    author_papers = (bibtex_df[['author', 'content_id']]
                     .assign(author=lambda d: d['author'].str.split(' and '))
                     .explode('author')
                     .reset_index(drop=True))

    #join author and tags
    bibtex_tags_df = author_papers.merge(tas_df, on='content_id')

    #Group and aggregate tags by author
    def _unique_tags(tags):
        # sorted() instead of a bare set(): string hashing is randomized per
        # process, so set order - and with it the tie-breaking in the
        # frequency ranking below - would otherwise change from run to run.
        unique_tags = set(','.join(tags.astype(str)).lower().split(','))
        return ','.join(sorted(unique_tags))

    tags_grouped = (bibtex_tags_df.groupby('author')['tag']
                    .apply(_unique_tags)
                    .reset_index()
                    .rename(columns={'tag': 'allTags'}))

    print("Grouped and aggregated tags by author, tags_grouped shape: {}".format(tags_grouped.shape))

    #4. Clean Tags
    tags_grouped['allTags'] = tags_grouped.allTags.apply(lambda x: filter_tags(x))
    tags_grouped_clean = tags_grouped[tags_grouped.allTags != ''].copy()
    tags_grouped_clean['allTags'] = tags_grouped_clean.allTags.apply(lambda x: trim_string(x))
    tags_grouped_clean['numTags'] = tags_grouped_clean.allTags.apply(lambda x: len(x.split(',')))

    #Consider authors with only 3 or more tags
    tags_grouped_clean = tags_grouped_clean[tags_grouped_clean.numTags >= 3]
    print("Cleaned tags, tags_grouped_clean shape: {}".format(tags_grouped_clean.shape))

    #5. Extract total counts of tag words (one count per author listing the tag)
    tagCounts = {}
    for tag_string in tags_grouped_clean.allTags:
        for w in tag_string.split(','):
            tagCounts[w] = tagCounts.get(w, 0) + 1

    #Extract vocab of common tags (stable sort: ties keep first-seen order)
    tagCounts_list = sorted(tagCounts.items(), key=lambda x: x[1], reverse=True)

    print("Len tags count list:", len(tagCounts_list))
    # NOTE(review): the slice starts at 1, so the single most frequent tag is
    # left out and at most 999 tags are kept - confirm this is intentional.
    tags_vocab_list = [x[0] for x in tagCounts_list[1:1000]]
    tags_vocab_set = set(tags_vocab_list)  # constant-time membership for the filter below

    def filter_tags_subset(tags_string):
        # Keep only the tags that belong to the frequent-tag vocabulary
        return ",".join(tag for tag in tags_string.split(",") if tag in tags_vocab_set)

    tags_grouped_clean = tags_grouped_clean.assign(
        allTags=tags_grouped_clean.allTags.apply(lambda x: filter_tags_subset(x)))
    tags_grouped_clean = tags_grouped_clean[tags_grouped_clean.allTags != ''].copy()
    tags_grouped_clean['numTags'] = tags_grouped_clean.allTags.apply(lambda x: len(x.split(',')))

    tags_grouped_clean = tags_grouped_clean[(tags_grouped_clean.numTags >= 3)
                                            & (tags_grouped_clean.numTags <= 100)]
    tags_grouped_clean = tags_grouped_clean.reset_index(drop=True)

    print("Filtered tags on common tags, tags_grouped_clean shape: {}".format(tags_grouped_clean.shape))

    runTime = time.perf_counter() - startTime
    print("Bibsonomy data generation run time = {:.1f} seconds".format(runTime))

    return tags_grouped_clean, tags_vocab_list
    

In [None]:
#Run extraction and pre-processing code
# Minimum paper year (inclusive); also embedded in the output file names when the lists are saved
paper_year_bibsonomy = 2000

# bibtex_tags: per-author tag table, tags_vocab: list of the tags kept as vocabulary
bibtex_tags, tags_vocab = extractBibsonomyDataByYear(paper_year_bibsonomy)

In [None]:
# Summary statistics of the number of tags per author
bibtex_tags.numTags.describe()

### Generate bibsonomy expert and task skills lists

In [None]:
# Authors with at least this many tags are treated as experts; the rest become tasks
prolific_threshold = 12

In [None]:
# Split authors on the tag-count threshold: at or above it -> experts, below it -> tasks
experts_df = bibtex_tags.loc[bibtex_tags['numTags'] >= prolific_threshold]
tasks_df = bibtex_tags.loc[bibtex_tags['numTags'] < prolific_threshold]
print(experts_df.shape, tasks_df.shape)

In [None]:
# Distribution of tag counts among experts
experts_df.hist(column=['numTags'], bins=10, figsize=(8,5))

In [None]:
# Distribution of tag counts among tasks
tasks_df.hist(column=['numTags'], bins=5, figsize=(8,5))

In [None]:
# Distribution of tag counts over all retained authors
bibtex_tags.hist(column=['numTags'], bins=20, figsize=(8,5))

In [None]:
# Map every vocabulary tag to its position in tags_vocab; that position is the tag's skill id
tag_id_dict = {tag_name: indx for indx, tag_name in enumerate(tags_vocab)}

In [None]:
def extract_skills(tag_string, skill_id_map=None):
    """Translate a comma-separated tag string into a list of skill-id strings.

    This definition replaces the IMDB function of the same name defined
    earlier in the notebook.

    Args:
        tag_string (str): tags joined by ','.
        skill_id_map (dict, optional): tag -> integer id. When omitted, the
            notebook-level ``tag_id_dict`` is used, which matches the original
            single-argument behaviour.

    Returns:
        list[str]: ``str(id)`` for each tag, in the order given.

    Raises:
        KeyError: if a tag is missing from the mapping.
    """
    if skill_id_map is None:
        # Resolved at call time, so the cell that builds tag_id_dict must have run.
        skill_id_map = tag_id_dict
    return [str(skill_id_map[skill]) for skill in tag_string.split(',')]


#Skills list for experts - prolific authors
def create_expert_skills_list():
    """Return one list of skill-id strings per row of the global ``experts_df``.

    Side effect: stores the per-row lists in a new ``expert_skills`` column of
    ``experts_df``. This definition replaces the IMDB function of the same
    name defined earlier in the notebook.
    """
    skills_per_expert = experts_df.allTags.apply(lambda tags: extract_skills(tags))
    experts_df['expert_skills'] = skills_per_expert

    return experts_df['expert_skills'].to_list()


#Skills list for tasks - authors below the prolific threshold
def create_tasks_skills_list():
    """Return one list of skill-id strings per row of the global ``tasks_df``.

    Side effect: stores the per-row lists in a new ``tasks_skills`` column of
    ``tasks_df``. This definition replaces the IMDB function of the same name
    defined earlier in the notebook.
    """
    skills_per_task = tasks_df.allTags.apply(lambda tags: extract_skills(tags))
    tasks_df['tasks_skills'] = skills_per_task

    return tasks_df['tasks_skills'].to_list()

In [None]:
# Build the Bibsonomy skill lists; this overwrites the IMDB lists held in the same names
expSkills = create_expert_skills_list()
taskSkills = create_tasks_skills_list()

In [None]:
#Save to disk
# Each skills list is written as a single JSON document; the paper year is part of the file name.
bibsonomy_outpath = '/usr3/graduate/kvombat/Team-Formation/Datasets/bibsonomy_datasets/'

experts_filename = 'bibsonomy_experts_'+ str(paper_year_bibsonomy) + '.txt'
with open(bibsonomy_outpath + experts_filename, 'w') as f:
    json.dump(expSkills, f)

tasks_filename = 'bibsonomy_tasks_'+ str(paper_year_bibsonomy) + '.txt'
with open(bibsonomy_outpath + tasks_filename, 'w') as f:
    json.dump(taskSkills, f)

### Test: read in final Bibsonomy data

In [None]:
# Load the experts/tasks skill lists back from their JSON text files and report their sizes.
# NOTE(review): these file names carry no year suffix, whereas the save cell writes
# 'bibsonomy_experts_<year>.txt' / 'bibsonomy_tasks_<year>.txt' - confirm which files are meant.
read_path = '/usr3/graduate/kvombat/Team-Formation/Datasets/bibsonomy_datasets/'

with open(read_path + 'bibsonomy_experts.txt', 'r') as f:
    expert_skills_list = json.load(f)

with open(read_path + 'bibsonomy_tasks.txt', 'r') as f:
    task_skills_list = json.load(f)

print("Num Experts={}, Num Tasks={}".format(len(expert_skills_list), len(task_skills_list)))