# Source Code
Includes: cleaning of the conversations data, scraping of the Bechdel test data, merging of the two datasets into a usable set, and descriptions/comments kept for reference.

*This notebook contains only the code used in the final draft; everything else is stashed in **archive-code.ipynb**.*

In [1]:
import pandas as pd
from pathlib import Path
import numpy as np

import json
import csv

from sklearn.linear_model import LogisticRegression
import seaborn
from matplotlib import pyplot

import duckdb, sqlalchemy

## Data Cleaning & Scraping

### Conversations Dataset
*Includes movie ID, movie name, release year, rating, votes, & genre.*

In [4]:
# Load the per-conversation metadata, drop rows with any missing value, and
# give the raw columns friendlier names.
# NOTE: DataFrame.dropna() returns a new frame; its result has to be kept
# (here by chaining) or the call has no effect.
conversations_df = (
    pd.read_csv("movie-corpus/conversations.csv")
    .dropna()
    .rename(columns={
        '_key':'conversation_id',
        'meta/movie_idx':'movie_id',
        'meta/movie_name':'movie_name',
        'meta/release_year':'release_year',
        'meta/rating':'rating',
        'meta/votes':'imbd_votes',
        'meta/genre':'genre'
    })
)
conversations_df.head()

Unnamed: 0,conversation_id,movie_id,movie_name,release_year,rating,imbd_votes,genre
0,L1044,m0,10 things i hate about you,1999,6.9,62847,"['comedy', 'romance']"
1,L984,m0,10 things i hate about you,1999,6.9,62847,"['comedy', 'romance']"
2,L924,m0,10 things i hate about you,1999,6.9,62847,"['comedy', 'romance']"
3,L870,m0,10 things i hate about you,1999,6.9,62847,"['comedy', 'romance']"
4,L866,m0,10 things i hate about you,1999,6.9,62847,"['comedy', 'romance']"


In [6]:
# Keep only the movie-level metadata columns, then collapse the
# conversation-level rows down to the first row seen for each movie title.
conversations_df = (
    conversations_df
    .loc[:, ['movie_name', 'release_year', 'rating', 'imbd_votes', 'genre']]
    .drop_duplicates(subset=['movie_name'], keep='first')
)
conversations_df.head()

Unnamed: 0,movie_name,release_year,rating,imbd_votes,genre
0,10 things i hate about you,1999,6.9,62847,"['comedy', 'romance']"
201,1492: conquest of paradise,1992,6.2,10421,"['adventure', 'biography', 'drama', 'history']"
294,15 minutes,2001,6.1,25854,"['action', 'crime', 'drama', 'thriller']"
472,2001: a space odyssey,1968,8.4,163227,"['adventure', 'mystery', 'sci-fi']"
545,48 hrs.,1982,6.9,22289,"['action', 'comedy', 'crime', 'drama', 'thrill..."


### Scraping + Cleaning
*Scraping from https://bechdeltest.com/?list=all to collect a list of movies and the binary value for whether or not each passes the Bechdel Test.*

In [7]:
import requests
from bs4 import BeautifulSoup

In [8]:
# Download the full Bechdel Test movie list and parse it into a BeautifulSoup tree.
list_url = 'https://bechdeltest.com/?list=all'
# A timeout keeps the notebook from hanging forever on a stalled connection.
list_result = requests.get(list_url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
list_result.raise_for_status()
list_page = BeautifulSoup(list_result.text, 'html.parser')

In [12]:
# Every <div class="movie"> element on the page is treated as one film entry.
findMovies = list_page.find_all('div', {'class':'movie'})

#make movie name array
# Titles are normalised by removing newlines and ALL spaces and lowercasing,
# e.g. "The Adam Project" -> "theadamproject". Anything joined against this
# column must be compared in the same space-free form.
movie_names = [
    entry.text.replace('\n', '').replace(' ', '').lower()
    for entry in findMovies
]
print(len(movie_names))


#make pass/fail array
# Icons are read from the first <div class="list"> only. Images whose src is
# neither the pass nor the no-pass icon (or that have no src at all) are skipped.
findPass = list_page.find_all('div', {'class':'list'})[0]
icon_to_flag = {'/static/pass.png': 1, '/static/nopass.png': 0}
movie_pass = [
    icon_to_flag[img.get('src')]
    for img in findPass.find_all('img')
    if img.get('src') in icon_to_flag
]
print(len(movie_pass))

# Names and flags are collected by two independent searches and paired purely
# by position, so a length mismatch means the pairing cannot be trusted.
if len(movie_names) != len(movie_pass):
    raise ValueError(
        f"Scraped {len(movie_names)} movie names but {len(movie_pass)} "
        "pass/fail icons; the page layout may have changed."
    )

#make dataframe with movie name + pass/fail metric
columns = ['movie', 'bechdel_pass']
bechdel_df = pd.DataFrame(
    {'movie': movie_names, 'bechdel_pass': movie_pass},
    columns=columns,
)
bechdel_df.head(n=10)

9630
9630


Unnamed: 0,movie,bechdel_pass
0,the355,1
1,theadamproject,0
2,alltheoldknives,1
3,allrasíðastaveiðiferðin,0
4,ambulance,0
5,thebadguys,1
6,badhaaido,1
7,thebatman,1
8,beavisandbutt-headdotheuniverse,0
9,blasted,1



## Merging Bechdel Tests & Movie Stats
Merge the new bechdel dataframe with the necessary movie stats.<br>
<br>
<em>Concerns and considerations:</em>
<ul>
    <li>The ConvoKit dataset only covers movies released through 2011, while the Bechdel scores skew heavily toward recent releases, so 
        while the merge will drop any non-overlapping films, I want to make sure a statistically meaningful 
        amount of data remains.</li>
    <li>The titles of the films need to match exactly, so I want to make sure any differences in how the titles are presented 
        are covered.</li>    
</ul>

In [13]:
# Load the SQL magic extension so that %sql / %%sql are available in this notebook.
%load_ext sql

# SqlMagic options:
#   autopandas -> query results come back as pandas DataFrames
#   feedback   -> turned off to silence the status message printed after a statement
#   displaycon -> turned off so the connection string is not echoed with each query
%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False

# Connect the magic to an in-memory DuckDB database (nothing is persisted to disk).
%sql duckdb:///:memory:

In [14]:
conversations_df = conversations_df[['movie_name', 'release_year', 'rating', 'imbd_votes', 'genre']]
conversations_df = conversations_df.drop_duplicates(subset=['movie_name'], keep='first')
#display(conversations_df)
#display(bechdel_df)

joined_movies = %sql SELECT conversations_df.movie_name, conversations_df.release_year, conversations_df.rating, conversations_df.imbd_votes, conversations_df.genre, bechdel_df.bechdel_pass FROM conversations_df INNER JOIN bechdel_df ON conversations_df.movie_name = bechdel_df.movie
joined_movies = %sql SELECT * FROM joined_movies ORDER BY release_year
display(joined_movies)

bechdel_movies = joined_movies.to_csv('bechdel_movies.csv')
print(bechdel_movies)

Unnamed: 0,movie_name,release_year,rating,imbd_votes,genre,bechdel_pass
0,metropolis,1927,8.4,40730,"['adventure', 'drama', 'sci-fi']",0
1,frankenstein,1931,8.0,23522,"['drama', 'horror', 'sci-fi']",0
2,vampyr,1932,7.6,4005,"['fantasy', 'horror']",1
3,ninotchka,1939,7.9,6951,"['comedy', 'romance']",1
4,casablanca,1942,8.8,170874,"['drama', 'romance', 'war']",0
...,...,...,...,...,...,...
121,crash,2004/I,8.0,174003,"['crime', 'drama']",1
122,domino,2005,5.9,32949,"['action', 'crime', 'drama', 'thriller']",1
123,slither,2006,6.6,26497,"['comedy', 'horror', 'sci-fi']",1
124,juno,2007,7.9,152436,"['comedy', 'drama']",1


None
