## PREPARE

In [1]:

import requests,os,time
def ratelimit():
    "A function that handles the rate of your calls."
    time.sleep(0.5) # sleep one second.

class Connector():
  def __init__(self,logfile,overwrite_log=False,connector_type='requests',session=False,path2selenium='',n_tries = 5,timeout=30):
    """This Class implements a method for reliable connection to the internet and monitoring. 
    It handles simple errors due to connection problems, and logs a range of information for basic quality assessments
    
    Keyword arguments:
    logfile -- path to the logfile
    overwrite_log -- bool, defining if logfile should be cleared (rarely the case). 
    connector_type -- use the 'requests' module or the 'selenium'. Will have different since the selenium webdriver does not have a similar response object when using the get method, and monitoring the behavior cannot be automated in the same way.
    session -- requests.session object. For defining custom headers and proxies.
    path2selenium -- str, sets the path to the geckodriver needed when using selenium.
    n_tries -- int, defines the number of retries the *get* method will try to avoid random connection errors.
    timeout -- int, seconds the get request will wait for the server to respond, again to avoid connection errors.
    """
    
    ## Initialization function defining parameters. 
    self.n_tries = n_tries # For avoiding triviel error e.g. connection errors, this defines how many times it will retry.
    self.timeout = timeout # Defining the maximum time to wait for a server to response.
    ## not implemented here, if you use selenium.
    if connector_type=='selenium':
      assert path2selenium!='', "You need to specify the path to you geckodriver if you want to use Selenium"
      from selenium import webdriver 
      ## HIN download the latest geckodriver here: https://github.com/mozilla/geckodriver/releases

      assert os.path.isfile(path2selenium),'You need to insert a valid path2selenium the path to your geckodriver. You can download the latest geckodriver here: https://github.com/mozilla/geckodriver/releases'
      self.browser = webdriver.Firefox(executable_path=path2selenium) # start the browser with a path to the geckodriver.

    self.connector_type = connector_type # set the connector_type
    
    if session: # set the custom session
      self.session = session
    else:
      self.session = requests.session()
    self.logfilename = logfile # set the logfile path
    ## define header for the logfile
    header = ['id','project','connector_type','t', 'delta_t', 'url', 'redirect_url','response_size', 'response_code','success','error']
    if os.path.isfile(logfile):        
      if overwrite_log==True:
        self.log = open(logfile,'w')
        self.log.write(';'.join(header))
      else:
        self.log = open(logfile,'a')
    else:
      self.log = open(logfile,'w')
      self.log.write(';'.join(header))
    ## load log 
    with open(logfile,'r') as f: # open file
        
      l = f.read().split('\n') # read and split file by newlines.
      ## set id
      if len(l)<=1:
        self.id = 0
      else:
        self.id = int(l[-1][0])+1
            
  def get(self,url,project_name):
    """Method for connector reliably to the internet, with multiple tries and simple error handling, as well as default logging function.
    Input url and the project name for the log (i.e. is it part of mapping the domain, or is it the part of the final stage in the data collection).
    
    Keyword arguments:
    url -- str, url
    project_name -- str, Name used for analyzing the log. Use case could be the 'Mapping of domain','Meta_data_collection','main data collection'. 
    """
     
    project_name = project_name.replace(';','-') # make sure the default csv seperator is not in the project_name.
    if self.connector_type=='requests': # Determine connector method.
      for _ in range(self.n_tries): # for loop defining number of retries with the requests method.
        ratelimit()
        t = time.time()
        try: # error handling 
          response = self.session.get(url,timeout = self.timeout) # make get call

          err = '' # define python error variable as empty assumming success.
          success = True # define success variable
          redirect_url = response.url # log current url, after potential redirects 
          dt = t - time.time() # define delta-time waiting for the server and downloading content.
          size = len(response.text) # define variable for size of html content of the response.
          response_code = response.status_code # log status code.
          ## log...
          call_id = self.id # get current unique identifier for the call
          self.id+=1 # increment call id
          #['id','project_name','connector_type','t', 'delta_t', 'url', 'redirect_url','response_size', 'response_code','success','error']
          row = [call_id,project_name,self.connector_type,t,dt,url,redirect_url,size,response_code,success,err] # define row to be written in the log.
          self.log.write('\n'+';'.join(map(str,row))) # write log.
          self.log.flush()
          return response,call_id # return response and unique identifier.

        except Exception as e: # define error condition
          err = str(e) # python error
          response_code = '' # blank response code 
          success = False # call success = False
          size = 0 # content is empty.
          redirect_url = '' # redirect url empty 
          dt = t - time.time() # define delta t

          ## log...
          call_id = self.id # define unique identifier
          self.id+=1 # increment call_id

          row = [call_id,project_name,self.connector_type,t,dt,url,redirect_url,size,response_code,success,err] # define row
          self.log.write('\n'+';'.join(map(str,row))) # write row to log.
          self.log.flush()
    else:
      t = time.time()
      ratelimit()
      self.browser.get(url) # use selenium get method
      ## log
      call_id = self.id # define unique identifier for the call. 
      self.id+=1 # increment the call_id
      err = '' # blank error message
      success = '' # success blank
      redirect_url = self.browser.current_url # redirect url.
      dt = t - time.time() # get time for get method ... NOTE: not necessarily the complete load time.
      size = len(self.browser.page_source) # get size of content ... NOTE: not necessarily correct, since selenium works in the background, and could still be loading.
      response_code = '' # empty response code.
      row = [call_id,project_name,self.connector_type,t,dt,url,redirect_url,size,response_code,success,err] # define row 
      self.log.write('\n'+';'.join(map(str,row))) # write row to log file.
      self.log.flush()
    # Using selenium it will not return a response object, instead you should call the browser object of the connector.
    ## connector.browser.page_source will give you the html.
      return None,call_id

In [2]:
connector = Connector('Super Log')
from bs4 import BeautifulSoup
import seaborn as sns
import pandas as pd
import numpy as np
import nltk
import requests
import nltk, nltk.sentiment, sklearn
%matplotlib inline
name=nltk.corpus.names

## NAME LIST

In [3]:
# SCRABE FEMALE FIRST NAMES AND OTHERS
first_female=[]
sur_female=[]
full_f=[]
for i in range(1,8):
    url='https://imdb.com/list/ls022928836/?sort=list_order,asc&mode=detail&page={}'.format(i)
    call='Exam, KU, Female Page {}'.format(i)
    response,call_id = connector.get(url, call)
    html = response.text
    soup=str(html)
    if i < 7:
        for i in range(1,101):
            first_female.append((soup.split('div class="lister-list"')[1].split('h3 class')[i].split('</span>\n<a href=')[1].split('>')[1].split('\n')[0].split(' ')[1]))
            sur_female.append((soup.split('div class="lister-list"')[1].split('h3 class')[i].split('</span>\n<a href=')[1].split('>')[1].split('\n')[0].split(' ')[2:]))
            full_f.append((soup.split('div class="lister-list"')[1].split('h3 class')[i].split('</span>\n<a href=')[1].split('>')[1].split('\n')[0].lstrip()))
    if i == 7:
        for i in range(1,7):
            first_female.append((soup.split('div class="lister-list"')[1].split('h3 class')[i].split('</span>\n<a href=')[1].split('>')[1].split('\n')[0].split(' ')[1]))
            sur_female.append((soup.split('div class="lister-list"')[1].split('h3 class')[i].split('</span>\n<a href=')[1].split('>')[1].split('\n')[0].split(' ')[2:]))
            full_f.append((soup.split('div class="lister-list"')[1].split('h3 class')[i].split('</span>\n<a href=')[1].split('>')[1].split('\n')[0].lstrip()))
# SCRABE MALE FIRST NAMES AND OTHERS
first_male=[]
sur_male=[]
full_m=[]
for i in range (1,8):
    url='https://www.imdb.com/list/ls022928819/?sort=list_order,asc&mode=detail&page={}'.format(i)
    call='Exam, KU, Male Page {}'.format(i)
    response,call_id = connector.get(url, call)
    html = response.text
    soup=str(html)
    if i < 7:
        for i in range(1,101):
            first_male.append((soup.split('div class="lister-list"')[1].split('h3 class')[i].split('</span>\n<a href=')[1].split('>')[1].split('\n')[0].split(' ')[1]))
            sur_male.append((soup.split('div class="lister-list"')[1].split('h3 class')[i].split('</span>\n<a href=')[1].split('>')[1].split('\n')[0].split(' ')[2:]))
            full_m.append((soup.split('div class="lister-list"')[1].split('h3 class')[i].split('</span>\n<a href=')[1].split('>')[1].split('\n')[0].lstrip()))
    if i == 7:
        for i in range(1,82):
            first_male.append((soup.split('div class="lister-list"')[1].split('h3 class')[i].split('</span>\n<a href=')[1].split('>')[1].split('\n')[0].split(' ')[1]))
            sur_male.append((soup.split('div class="lister-list"')[1].split('h3 class')[i].split('</span>\n<a href=')[1].split('>')[1].split('\n')[0].split(' ')[2:]))
            full_m.append((soup.split('div class="lister-list"')[1].split('h3 class')[i].split('</span>\n<a href=')[1].split('>')[1].split('\n')[0].lstrip()))

first_m=list(dict.fromkeys(first_male))
first_f=list(dict.fromkeys(first_female))

# REMOVING DUPLICATES BY MAKING INTO DICT AND BACK AGAIN

male_name=name.words('male.txt')+first_m
first_m_done=list(dict.fromkeys(male_name))

female_name=name.words('female.txt')+first_f
first_f_done=list(dict.fromkeys(female_name))

# SCRABE FEMALE DIRECTOR NAMES
female_name_director = [] 
url = 'https://www.imdb.com/list/ls003532091/?fbclid=IwAR36QhJh8EDYKPCh_9RTsJdkvjNf-c-xQxtGy8bUZz1E2HefYpzxWeFA5w0'
call = 'Exam, KU{}'.format(i)
response,call_id = connector.get(url, call)
if response.ok:
    html = response.text
else:
    print('error')
soup= BeautifulSoup(html,'html.parser')
    
for j in range(0,100): 
    tree_node = soup.findAll('h3')[j]
    name = tree_node.text.split('\n')[2].strip()
    female_name_director.append(name)

# SCRABE MALE DIRECTOR NAMES
male_name_director = []
url = 'https://www.imdb.com/list/ls058727091/'
call = 'Exam, KU{}'.format(i)
response,call_id = connector.get(url, call)
if response.ok:
    html = response.text
else:
    print('error')
soup= BeautifulSoup(html,'html.parser')    
for j in range(0,100): 
    tree_node = soup.findAll('h3')[j]
    name = tree_node.text.split('\n')[2].strip()
    male_name_director.append(name)

# Gendering neutral names and removing
names='C:/Users/frede/OneDrive/Documents/GitHub/Group/Group-31-SoDa/Data/sorted_names.csv'
sorted_name=pd.read_csv(names, sep=';')
fem_sort=list(sorted_name['F'].str.lstrip())
mal_sort=list(sorted_name['M'].str.lstrip())
neu_sort=list(sorted_name['N'].str.lstrip())
fem_remove = mal_sort + neu_sort
mal_remove = fem_sort + neu_sort
sort_f=[x for x in first_f_done if x not in fem_remove]
sort_m=[x for x in first_m_done if x not in mal_remove]

## SCRAPING

In [4]:
# Create Lists
gross=[]
rating=[]
movie_stars=[]
director=[]
year=[]
name=[]
genre=[]
summary=[]

for v in range (1,201):
    url = 'https://www.imdb.com/search/keyword/?ref_=kw_ref_yr&mode=detail&page={}&title_type=movie&fbclid=IwAR3B7G9VdhKWjVvbFIPhdH9vGLZwmO_zzTeNlCj4whUMbn_RtS3g1g9FiUQ&release_date=1980%2C2019&sort=num_votes,desc'.format(v)
    response,callid = connector.get(url,'Exam')
    html = response.text
    soup = BeautifulSoup(html,'lxml')
    print(v)
    for j in range(50):
        box=soup.findAll('div',{"class":'lister-item mode-detail'})[j]
# BOX OFFICE
        if len(box.findAll("span", {"name": "nv"})) < 2:
            gross.append(None)
        if len(box.findAll("span", {"name": "nv"})) == 2:
            gross.append(box.findAll("span", {"name": "nv"})[1].text)
# Rating
        if box.findAll("div", {"class": "inline-block ratings-imdb-rating"})==[]:
            rating.append(None)
        else:
            primo=box.findAll("div", {"class": "inline-block ratings-imdb-rating"})[0].text
            rating.append(primo.split('\n')[2])
# Movie Stars
        if box.findAll("p", {"class":"text-muted text-small"})== []:
            movie_stars.append('NaN')
        else: 
            stars = box.findAll("p", {"class":"text-muted text-small"})[1].text.strip()
            if len(stars.split(':'))<3:
                movie_stars.append('NaN')
            else:
                stars_1 = stars.split(':')[2].split('\n')[1:]
                movie_stars.append(stars_1)
# Directors
        di=box.findAll("p", {"class": "text-muted text-small"})[1].text
        if len(di.split('Stars')[0].split('\n')) > 5:
            director.append(di.split('Stars')[0].split('\n')[2:4])#+primo.split('Stars')[0].split('\n')[3])
        else:
            director.append(di.split('Stars')[0].split('\n')[2])
# Years
        headline_j = soup.findAll('h3')[j] # search for the first headline: h1 tag. 
        #name = headline_i['class'][0].strip() # use the class attribute name as column name.
        value = headline_j.text.strip() # extract text using build in method        
        film = value.split('\n')[2]
        if len(film)> 6:
            film = value.split(' ')[-1]
        year.append(film[1:5])
# Name
        headline_j = soup.findAll('h3')[j] # search for the first headline: h1 tag. 
        #name = headline_i['class'][0].strip() # use the class attribute name as column name.
        value = headline_j.text.strip() # extract text using build in method.
        film = value.split('\n')[1]
        name.append(film)
# Genre
        if box.findAll("span", {"class":"genre"}) ==[]:
            genre.append(None)
        else: 
            movie_genre = box.findAll("span", {"class":"genre"})[0].text.strip()
            genre.append(movie_genre)
# Summary
        if box.findAll("p", {"class":""})== []:
            summary.append(None)
        else: 
            movie_summary = box.findAll("p", {"class":""})[0].text.strip()
            summary.append(movie_summary)


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200


In [5]:
print(len(gross))
print(len(rating))
print(len(movie_stars))
print(len(director))
print(len(year))
print(len(name))
print(len(genre))
print(len(summary))

10000
10000
10000
10000
10000
10000
10000
10000


In [6]:
df_name=pd.DataFrame(name)
df_year=pd.DataFrame(year)
df_genre=pd.DataFrame(genre)
df_rating=pd.DataFrame(rating)
df_gross=pd.DataFrame(gross)
df_director=pd.DataFrame(director)
df_stars=pd.DataFrame(movie_stars)
df_summary=pd.DataFrame(summary)
df_name['year']=df_year
df_name['genre']=df_genre
df_name['rating']=df_rating
df_name['gross']=df_gross
df_name['director']=df_director
df_name['summary']=df_summary
df_name['Lead']=df_stars[0]
df_name['Star 2']=df_stars[1]
df_name['Star 3']=df_stars[2]
df_name['Star 4']=df_stars[3]
done=df_name
done.columns=['Title','Year','Genre','Rating','Gross','Director','Summary','Lead','Star 2','Star 3','Star 4']

In [7]:
done[1980:1987]

Unnamed: 0,Title,Year,Genre,Rating,Gross,Director,Summary,Lead,Star 2,Star 3,Star 4
1980,Sex Drive,2008,"Comedy, Romance",6.5,$8.40M,Sean Anders,A high school senior drives cross-country with...,"Josh Zuckerman,","Clark Duke,","Amanda Crew,",James Marsden
1981,Goon,2011,"Comedy, Drama, Sport",6.8,$4.17M,Michael Dowse,"Labeled an outcast by his brainy family, a bou...","Seann William Scott,","Jay Baruchel,","Alison Pill,",Eugene Levy
1982,Skønheden i alting,2016,"Drama, Romance",6.8,$30.98M,David Frankel,"Retreating from life after a tragedy, a man qu...","Will Smith,","Edward Norton,","Kate Winslet,",Michael Peña
1983,Cop Land,1997,"Crime, Drama, Thriller",6.9,$44.89M,James Mangold,The Sheriff of a suburban New Jersey community...,"Sylvester Stallone,","Harvey Keitel,","Ray Liotta,",Robert De Niro
1984,Creed II,2018,"Drama, Sport",7.2,$115.72M,Steven Caple Jr.,"Under the tutelage of Rocky Balboa, newly crow...","Michael B. Jordan,","Sylvester Stallone,","Tessa Thompson,",Phylicia Rashad
1985,Mississippi burning,1988,"Crime, Drama, History",7.8,$34.60M,Alan Parker,"Two F.B.I. Agents, with wildly different style...","Gene Hackman,","Willem Dafoe,","Frances McDormand,",Brad Dourif
1986,The Last House on the Left,2009,"Horror, Thriller",6.5,$32.75M,Dennis Iliadis,After kidnapping and brutally assaulting two y...,"Garret Dillahunt,","Monica Potter,","Tony Goldwyn,",Michael Bowen


## DATA CLEANING

In [14]:
# STARS
done['Lead']=done['Lead'].str.replace(',','')
done['Star 2']=done['Star 2'].str.replace(',','')
done['Star 3']=done['Star 3'].str.replace(',','')
# GROSS
done["Gross"] = done["Gross"].str.replace("M","")
done["Gross"] = done["Gross"].str.replace("$","")
done["Gross"] = done['Gross']
# GENRE
done["Genre"] = pd.Series(done["Genre"])
new1 = done["Genre"].str.split(",", expand=True)
done["Genre 1"]= new1[0] 
done["Genre 2"]= new1[1]
done["Genre 3"]= new1[2]
# DIRECTOR
for i in range(10000):
    if type(done['Director'][i]) != str:
        done['Director'][i]=str(done['Director'][i])
    else:
        pass


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


## Gendering the actors & director

In [15]:
# Counting whether a male or female name appears
tokenizer = nltk.tokenize.TweetTokenizer()

def preprocessing(string):
    token=tokenizer.tokenize(string)
    return token
def count_dictionary(dic,doc):
    doc_new=[ x for x in doc if x in dic]
    return len(doc_new)

done['m_count']=done['Lead'].apply(preprocessing).apply(count_dictionary, doc=sort_m)
done['f_count']=done['Lead'].apply(preprocessing).apply(count_dictionary, doc=sort_f)
done['star_m']=done['Lead'].apply(count_dictionary, doc=full_m)
done['star_f']=done['Lead'].apply(count_dictionary, doc=full_f)
done['d_m_count']=done['Director'].apply(preprocessing).apply(count_dictionary, doc=sort_m)
done['d_f_count']=done['Director'].apply(preprocessing).apply(count_dictionary, doc=sort_f)
done['d_star_m']=done['Director'].apply(count_dictionary, doc=male_name_director)
done['d_star_f']=done['Director'].apply(count_dictionary, doc=female_name_director)

In [16]:
# Ensuring no properly gendered person is Neutral, and identify non-assigned. Stars given true gender
# Gender-matching, actor
done.loc[(done.m_count > 0), 'Male'] = 1
done.loc[(done.f_count > 0), 'Female'] = 1
done.loc[(done.f_count > 0) & (done.m_count > 0), 'Neutral'] = 1
done.loc[(done.star_m) ==1, 'Neutral'] = 0
done.loc[(done.star_m) ==1, 'Female'] = 0
done.loc[(done.star_m) ==1, 'Male'] = 1
done.loc[(done.star_f) ==1, 'Neutral'] = 0
done.loc[(done.star_f) ==1, 'Male'] = 0
done.loc[(done.star_f) ==1, 'Female'] = 1
done.loc[(done.m_count==0) & (done.f_count==0) & (done.star_m==0) & (done.star_f==0), 'Non'] = 1
done.loc[done.Neutral==1, 'Male'] = 0
done.loc[done.Neutral==1, 'Female'] = 0


# Gender-matching, director
done.loc[(done.d_m_count > 0), 'd_Male'] = 1
done.loc[(done.d_f_count > 0), 'd_Female'] = 1
done.loc[(done.d_f_count > 0) & (done.d_m_count > 0), 'd_Neutral'] = 1
done.loc[(done.d_star_m) ==1, 'd_Neutral'] = 0
done.loc[(done.d_star_m) ==1, 'd_Female'] = 0
done.loc[(done.d_star_m) ==1, 'd_Male'] = 1
done.loc[(done.d_star_f) ==1, 'd_Neutral'] = 0
done.loc[(done.d_star_f) ==1, 'd_Male'] = 0
done.loc[(done.d_star_f) ==1, 'd_Female'] = 1
done.loc[done.d_Neutral==1, 'd_Male'] = 0
done.loc[done.d_Neutral==1, 'd_Female'] = 0
done.loc[(done.d_m_count==0) & (done.d_f_count==0) & (done.d_star_m==0) & (done.d_star_f==0), 'd_Non'] = 1
done.loc[(done.d_Neutral==1) | (done.d_Non==1), 'd_Drop'] = 1 # Create variable so that non-gendered directors can be removed

In [25]:
# Remove Neutral and Non and clean up
done_1=done.drop(columns=['star_f','star_m','m_count','f_count','d_m_count','d_f_count','d_star_m','d_star_f'])
done_1.head(1)
done_1=done_1[done_1.Neutral != 1]
done_1=done_1[done_1.Non != 1]
done_1=done_1.reset_index()


done_true=done_1.drop(columns=['index','Male','Neutral','Non','d_Male','d_Neutral','d_Non','Star 2','Star 3','Star 4'])
done_true['Female'].fillna(0, inplace= True)
done_true['d_Female'].fillna(0, inplace= True)
done_true['d_Drop'].fillna(0, inplace= True)

# DIRECTOR CLEAN

for e in range(8262):
    if "[" in done_true["Director"][e]:
        done_true["Director"][e] = done_true["Director"][e].split(",")[0].replace("[","").replace("'","")
    else:
        pass

abs_path='C:/Users/frede/OneDrive/Documents/GitHub/Group/Group-31-SoDa/Data/test.csv'
done_true.to_csv(abs_path)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
