# Data Acquisition Code

<br>
This notebook contains the commands needed to acquire the data from the TMDB database, using the Python wrapper library for the TMDB API


<br>

## A - Data Acquisition


<br>

### Importing Modules


<br>

In [4]:
from pypandoc.pandoc_download import download_pandoc
import requests
import pypandoc
import json
import time
import itertools
import wget
import os
import png
import pickle
import numpy as np
import tmdbsimple as tmdb
import time
import sys
import urllib
from PIL import Image

import random
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
from sklearn.cluster.bicluster import SpectralCoclustering
from sklearn.metrics import precision_recall_curve
import scipy

sns.set_style('white')
import pandas as pd


### Importing ID File and Genres List

<br>

In [3]:
# Genres list
gen_list = pd.read_csv('genres_TMDB.csv')

# Movie-id table: read the whole file, then keep positional rows 4000..7999
idsdf = pd.read_csv('randomID2.csv')
idsdf = idsdf.iloc[4000:8000]

# Show the first rows of the selected slice
idsdf.head()

Unnamed: 0,imdb,tmdb
4000,tt1286809,16277
4001,tt0102351,11175
4002,tt0986233,10360
4003,tt1016307,25656
4004,tt0103016,10261


### Acquiring from TMDB:


<br>

#### 1- Specifying the IDs and length


<br>

In [3]:
# TMDB ids of the selected movies, taken from the `tmdb` column
ids = idsdf['tmdb'].values

# Number of ids to request
Len = ids.shape[0]

#### 2- Requesting the data

<br>
The data to be collected is:

 - Title
 - Budget
 - Overview
 - Revenue
 - Runtime
 - Poster Path
 - Genres
 - Release Year
 - Movie Popularity
 
 
<br>

In [4]:
# Accessing the TMDB
# The API key is read from the TMDB_API_KEY environment variable so that the
# secret is not stored in (or shared together with) the notebook.
tmdb.API_KEY = os.environ.get('TMDB_API_KEY')
if not tmdb.API_KEY:
    raise RuntimeError("TMDB_API_KEY environment variable is not set")

# Initiate the data lists. Every iteration appends exactly one entry to each
# list, so all of them stay aligned with `ids`.
t_tmdb   = []    # title
b_tmdb   = []    # budget
o_tmdb   = []    # overview
rv_tmdb  = []    # revenue
rt_tmdb  = []    # runtime
p_tmdb   = []    # poster path
g_tmdb   = []    # Genres
y_tmdb   = []    # release year
pop_tmdb = []    # Popularity

# ids whose request was rejected by the API (kept so they can be retried)
failed_info_ids = []

for i in range(Len):

    # Specify the movie id
    movie = ids[i]

    # Get the movie. An HTTP error for a single id (e.g. an id that no longer
    # exists) must not abort the whole run: the id is recorded and the movie
    # is treated like one without a poster.
    mv = tmdb.Movies(movie)
    try:
        mv.info()
        has_poster = bool(getattr(mv, 'poster_path', None))
    except requests.exceptions.HTTPError:
        failed_info_ids.append(movie)
        has_poster = False

    # Append to the data based on availability of a poster
    if has_poster:

        t_tmdb.append(mv.title.encode('utf-8', 'strict'))
        # `or 0` / `or ""` guard against fields the API returns as null
        b_tmdb.append(float(mv.budget or 0))
        o_tmdb.append((mv.overview or u"").encode('utf-8', 'strict'))
        rv_tmdb.append(float(mv.revenue or 0))
        rt_tmdb.append(float(mv.runtime or 0))
        p_tmdb.append(mv.poster_path.encode('utf-8', 'strict'))
        g_tmdb.append(mv.genres)
        # Only the leading four characters (the year) of the date are kept
        y_tmdb.append(str((mv.release_date or "")[:4]))
        pop_tmdb.append(float(mv.popularity or 0))

    # Append an empty value otherwise
    else:
        t_tmdb.append("")
        b_tmdb.append(0)
        o_tmdb.append("")
        rv_tmdb.append(0)
        rt_tmdb.append(0)
        p_tmdb.append("")
        g_tmdb.append("")
        y_tmdb.append("")
        pop_tmdb.append(0)

    # Pause between requests
    time.sleep(1)

    # Update results; flush after writing so the progress line shows up immediately
    sys.stdout.write("\r Step: {}  id {}  acquired : {} ".format(i, movie , len(y_tmdb)))
    sys.stdout.flush()

# Report the ids that could not be fetched, if any
if failed_info_ids:
    sys.stdout.write("\n Failed ids: {}\n".format(failed_info_ids))
        


 Step: 1999  id 28788  acquired : 2000 

In [5]:
# Sanity check: number of entries collected in the genres list
len(g_tmdb)

2000





<br>
**To get information about the keywords and the director, separate code cells are used, since they belong to different sections of the database and each requires a separate call.**




<br>



<br>
#### 3 - Getting the Keywords

<br>

In [6]:

# Initiate the data lists
k_tmdb   = []    # keywords: a list of keyword names per movie, "" when none

# ids whose keywords request was rejected by the API (kept so they can be retried)
failed_keyword_ids = []

for i in range(Len):

    # Specify the movie id
    movie = ids[i]

    # Get the keywords of the movie. The list is read from the returned
    # response rather than from `mv.keywords`, because that attribute name is
    # also the request method and is not guaranteed to be replaced by the data.
    # An HTTP error for a single id is recorded instead of aborting the run.
    mv = tmdb.Movies(movie)
    try:
        response = mv.keywords()
        keywrds = response.get('keywords') or []
    except requests.exceptions.HTTPError:
        failed_keyword_ids.append(movie)
        keywrds = []

    # Check for availability
    if keywrds:

        # Keep only the keyword names
        k_tmdb.append([kw['name'].encode('utf-8', 'strict') for kw in keywrds])

    else:

        k_tmdb.append("")

    # Pause between requests
    time.sleep(1)

    # Update results; flush after writing so the progress line shows up immediately
    sys.stdout.write("\r Step: {}  acquired : {} ".format(i, len(k_tmdb)))
    sys.stdout.flush()

# Report the ids that could not be fetched, if any
if failed_keyword_ids:
    sys.stdout.write("\n Failed ids: {}\n".format(failed_keyword_ids))
        


 Step: 1999  acquired : 2000 



<br>
#### 4 - Director

<br>

In [8]:

# Initiate the data lists
d_tmdb   = []    # director name per movie, "" when none is found

# ids whose credits request was rejected by the API (kept so they can be retried)
failed_credit_ids = []

for i in range(Len):

    # Specify the movie id
    movie = ids[i]

    # Get the crew of the movie from the credits response. An HTTP error for a
    # single id is recorded instead of aborting the run.
    mv = tmdb.Movies(movie)
    try:
        response = mv.credits()
        crew = response.get('crew') or []
    except requests.exceptions.HTTPError:
        failed_credit_ids.append(movie)
        crew = []

    # Search the crew for the director; stays "" when the crew is empty or
    # has no director. When several crew members have the job "Director",
    # the last one listed is kept.
    direc = ""
    for member in crew:
        if member.get('job') == "Director":
            direc = member['name'].encode('utf-8', 'strict')

    # Append to the data list
    d_tmdb.append(direc)

    # Pause between requests
    time.sleep(1)

    # Update results; flush after writing so the progress line shows up immediately
    sys.stdout.write("\r Step: {}  acquired : {} ".format(i, len(d_tmdb)))
    sys.stdout.flush()

# Report the ids that could not be fetched, if any
if failed_credit_ids:
    sys.stdout.write("\n Failed ids: {}\n".format(failed_credit_ids))
        


 Step: 1999  acquired : 2000 



<br>
### Constructing the Features Output Frame

<br>

In this section the data will be gathered in a single dataframe to be exported and shared


<br>

In [9]:
# Collect every acquired list under its column name
data_dict = dict(
    id=ids,
    title=t_tmdb,
    budget=b_tmdb,
    overview=o_tmdb,
    revenue=rv_tmdb,
    keywords=k_tmdb,
    runtime=rt_tmdb,
    poster_path=p_tmdb,
    releaseyear=y_tmdb,
    popularity=pop_tmdb,
    director=d_tmdb,
    genres=g_tmdb,
)

# Build the data frame from the dictionary (one row per movie id)
data_raw = pd.DataFrame(data=data_dict)

In [10]:
# Preview the first 15 rows of the assembled frame
data_raw.head(15)

Unnamed: 0,budget,director,genres,id,keywords,overview,popularity,poster_path,releaseyear,revenue,runtime,title
0,2500000.0,Floyd Mutrux,"[{u'id': 35, u'name': u'Comedy'}]",21879,[teen movie],"Led by their comedic and pranking leader, Newb...",0.178502,/kZoeRbYSKYhFj5s15QCbHbydYTt.jpg,1980,10000000.0,91.0,The Hollywood Knights
1,0.0,Rudolph Maté,"[{u'id': 18, u'name': u'Drama'}, {u'id': 80, u...",27399,"[gangster, macao]",Forbidden bears traces of several earlier film...,0.094302,/rNAqcCtBQO2UmvWx9DkRQGTwM8J.jpg,1953,0.0,85.0,Forbidden
2,10000000.0,Dan Harris,"[{u'id': 35, u'name': u'Comedy'}, {u'id': 18, ...",25350,"[suicide, suicide mission]","Matt Travis is good-looking, popular, and his ...",0.231905,/sR651HaKsdn0lg9LOs1BQGKSY4e.jpg,2004,0.0,111.0,Imaginary Heroes
3,22000000.0,David R. Ellis,"[{u'id': 27, u'name': u'Horror'}, {u'id': 878,...",19116,[suspense],The teenager Madison McBride is traumatized by...,0.165075,/jAD5ltU8KNcmfurmNsaMox7n5Ab.jpg,2008,0.0,93.0,Asylum
4,0.0,Alfred L. Werker,"[{u'id': 80, u'name': u'Crime'}, {u'id': 9648,...",17481,"[jewel, tower of london, chinchilla foot, play...",Professor Moriarity has a scheme for stealing ...,0.586695,/6CMJC5FlagO9HM4YT5W3df9k5Nh.jpg,1939,0.0,85.0,The Adventures of Sherlock Holmes
5,0.0,Sidney J. Furie,"[{u'id': 18, u'name': u'Drama'}, {u'id': 28, u...",14149,,Four high school football stars enlist in the ...,0.034614,/ggINMOcBPxoHZOehtPP3iRnxZRd.jpg,2008,0.0,118.0,The Four Horsemen
6,0.0,Albert Pyun,"[{u'id': 28, u'name': u'Action'}, {u'id': 12, ...",13945,"[prince, princess, sword fight, king, barbaria...",A mercenary with a three-bladed sword rediscov...,0.263891,/m0rAw4P71AyAGaMxCiaBdnz8hRA.jpg,1982,0.0,100.0,The Sword and the Sorcerer
7,0.0,Howard Hall,"[{u'id': 99, u'name': u'Documentary'}]",17700,"[octopus, sea turtle, imax, shrimp, manta ray,...","Sea life in a whole new way. Deep Sea 3D, an u...",0.35755,/2UglT7JjN6o7dYni4G16NYOFRJO.jpg,2006,0.0,41.0,Deep Sea 3D
8,0.0,Mark Hicks,"[{u'id': 28, u'name': u'Action'}, {u'id': 35, ...",16287,,No overview found.,0.00058,/ezYuqtbdYNZEn4J1admwa6b9Gni.jpg,2009,0.0,88.0,Afro Ninja
9,5000.0,Josh Bernhard,"[{u'id': 18, u'name': u'Drama'}]",27806,"[date, blind date, woman between two men, pira...",Nick (Mike Pantozzi) gets his girls the same w...,0.001179,/8G9pbb7pAC22OOXcPJETc2s6WJb.jpg,2009,0.0,66.0,The Lionshare




<br>

### Exporting the Data to csv Files


<br>


In [11]:
# NOTE(review): the export below is disabled; `Genresdf` is not defined in the visible cells
# Genresdf.to_csv('Genres_labels_79.csv')
# Write the assembled meta-data frame to a CSV file in the working directory
data_raw.to_csv('Meta_data_24.csv')



<br>
## END