# Combining Data from Database Into CSV

In [1]:
import pandas as pd
import sqlalchemy
from sqlalchemy import create_engine
from config import login

In [2]:
# Establishing connection

db_url = 'postgresql://' + login + '@localhost:5432/movie_db'
engine = create_engine(db_url)
connection = engine.connect()

In [3]:
tag_query = f"select movie_id, tag from tags;"

tags = pd.read_sql(tag_query, connection)

tags.head()

Unnamed: 0,movie_id,tag
0,260,classic
1,260,sci-fi
2,1732,dark comedy
3,1732,great dialogue
4,7569,so bad it's good


In [4]:
# Iterrows to convert each tag to lowercase without spaces
for index, row in tags.iterrows():
    tag_converted = row['tag'].lower().replace(" ", "")
    tags.iloc[index, 1] = tag_converted
    

0 classic classic
1 sci-fi sci-fi
2 dark comedy darkcomedy
3 great dialogue greatdialogue
4 so bad it's good sobadit'sgood
5 unreliable narrators unreliablenarrators
6 tense tense
7 artificial intelligence artificialintelligence
8 philosophical philosophical
9 tense tense
10 so bad it's good sobadit'sgood
11 cliche cliche
12 musical musical
13 horror horror
14 unpredictable unpredictable
15 Oscar (Best Supporting Actress) oscar(bestsupportingactress)
16 adventure adventure
17 anime anime
18 ecology ecology
19 fantasy fantasy
20 Hayao Miyazaki hayaomiyazaki
21 Miyazaki miyazaki
22 post-apocalyptic post-apocalyptic
23 bah bah
24 Clint Eastwood clinteastwood
25 music music
26 art art
27 contemporary art contemporaryart
28 documentary documentary
29 aliens aliens
30 amazing photography amazingphotography
31 Director: James Cameron director:jamescameron
32 first contact firstcontact
33 James Cameron jamescameron
34 Michael Biehn michaelbiehn
35 sci-fi sci-fi
36 android(s)/cyborg(s) android(

KeyboardInterrupt: 

In [22]:
tags.head()

Unnamed: 0,movie_id,tag
0,60756,funny
1,60756,highlyquotable
2,60756,willferrell
3,89774,boxingstory
4,89774,mma


In [41]:
# Now we need to combine the tags into one column based on movie_id

# First create a new dataframe to store the tags and movie_ids
tags_combined = pd.DataFrame(columns=["movie_id", "tags"])
tags_combined.head()

Unnamed: 0,movie_id,tags


In [42]:
# Iterrows for tags df
for index, row in tags.iterrows():
    # Check if movie_id already exists in tags_combined
    movie_exist = tags_combined.loc[tags_combined['movie_id']==row['movie_id'], :]
    #movie_exist.head()
    if movie_exist.empty:
        # Add new movie row
        new_row = {'movie_id': row['movie_id'], 'tags': row['tag']}
        #print (new_row)
        tags_combined = tags_combined.append(new_row, ignore_index=True)
    else:
        #print("Movie ID exists! Add to row")
        
        # Combine tags
        new_tags = movie_exist['tags'] + " " + row['tag']
        # Update row
        tags_combined.loc[tags_combined['movie_id']==row['movie_id'], 'tags'] = new_tags
        
tags_combined.head()

Unnamed: 0,movie_id,tags
0,60756,funny highlyquotable willferrell comedy funny ...
1,89774,boxingstory mma tomhardy
2,106782,drugs leonardodicaprio martinscorsese stockmar...
3,48516,waytoolong leonardodicaprio suspense twistendi...
4,431,alpacino gangster mafia


In [43]:
# Save combined_tags to CSV
tags_combined.to_csv("movies_csv/tags_combined.csv", index=False)

In [46]:
# Read genres from DB so we can merge genres with the tags

genre_query = f"select movie_id, genre from movies;"

genre = pd.read_sql(genre_query, connection)

genre.head()

Unnamed: 0,movie_id,genre
0,1,Adventure|Animation|Children|Comedy|Fantasy
1,2,Adventure|Children|Fantasy
2,3,Comedy|Romance
3,4,Comedy|Drama|Romance
4,5,Comedy


In [47]:
# Iterrows to convert each genre to lowercase, swap | with a space
for index, row in genre.iterrows():
    genre_converted = row['genre'].lower().replace("|", " ")
    genre.iloc[index, 1] = genre_converted
    #print (index, row['genre'], genre_converted)
    
genre.head()

Unnamed: 0,movie_id,genre
0,1,adventure animation children comedy fantasy
1,2,adventure children fantasy
2,3,comedy romance
3,4,comedy drama romance
4,5,comedy


In [58]:
# Merge genre and tags_converted on movie_id

combined_df = genre.merge(tags_combined, on="movie_id", how="outer").fillna("")
combined_df.head()

Unnamed: 0,movie_id,genre,tags
0,1,adventure animation children comedy fantasy,pixar pixar fun
1,2,adventure children fantasy,fantasy magicboardgame robinwilliams game
2,3,comedy romance,moldy old
3,4,comedy drama romance,
4,5,comedy,pregnancy remake


In [59]:
# Create new bag_of_words df that combines genre and tags

bag_of_words = pd.DataFrame(columns=["movie_id", "bag_of_words"])
bag_of_words.head()

Unnamed: 0,movie_id,bag_of_words


In [60]:
# Iterrows for combined_df
for index, row in combined_df.iterrows():
    # Combine genre and tags
    bag = row['genre'] + " " + row['tags']
    
    new_row = {'movie_id': row['movie_id'], 'bag_of_words': bag}
    
    # Append row
    bag_of_words = bag_of_words.append(new_row, ignore_index=True)
        
bag_of_words.head()

Unnamed: 0,movie_id,bag_of_words
0,1,adventure animation children comedy fantasy pi...
1,2,adventure children fantasy fantasy magicboardg...
2,3,comedy romance moldy old
3,4,comedy drama romance
4,5,comedy pregnancy remake


In [61]:
# Save bag_of_words to CSV
bag_of_words.to_csv("movies_csv/bag_of_words.csv", index=False)

In [62]:
# Read movie title from DB so we can separate title from year, then merge with bag_of_words

movie_query = f"select movie_id, title from movies;"

movies = pd.read_sql(movie_query, connection)

movies.head()

Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [64]:
# Merge movies and bag_of_words on movie_id

movies_bag = movies.merge(bag_of_words, on="movie_id", how="outer").fillna("")
movies_bag.head()

Unnamed: 0,movie_id,title,bag_of_words
0,1,Toy Story (1995),adventure animation children comedy fantasy pi...
1,2,Jumanji (1995),adventure children fantasy fantasy magicboardg...
2,3,Grumpier Old Men (1995),comedy romance moldy old
3,4,Waiting to Exhale (1995),comedy drama romance
4,5,Father of the Bride Part II (1995),comedy pregnancy remake


In [65]:
# Save movies_bag to CSV
movies_bag.to_csv("movies_csv/movies_bag.csv", index=False)

## Split the title and year

In [85]:
# Create new movie_year df so we can split title and year

movie_year = pd.DataFrame(columns=["movie_id", "title", "year"])
movie_year.head()

Unnamed: 0,movie_id,title,year


In [66]:
# Import regular expression because there are titles with (words) in the name and we need to make sure 
# we split it at (year)
import re

In [86]:
# Iterrows for movies
for index, row in movies.iterrows():
    # Combine genre and tags
    to_split = row['title']
    title_split = re.split(r'[ ](?=\([1-2][0-9][0-9][0-9])', to_split)
    title = title_split[0]
    if len(title_split)>1:
        year = title_split[1].replace("(", "").replace(")", "") # Then strip () from string
    else:
        print (f"No year for {row['title']}")
        year = ""
    
    new_row = {'movie_id': row['movie_id'], 'title': title, 'year': year}
    #print(new_row)
    
    # Append row
    movie_year = movie_year.append(new_row, ignore_index=True)
    
    #if index == 3:
    #    break
        
movie_year.head()

No year for Babylon 5
No year for Ready Player One
No year for Hyena Road
No year for The Adventures of Sherlock Holmes and Doctor Watson
No year for Nocturnal Animals
No year for Paterson
No year for Moonlight
No year for The OA
No year for Cosmos
No year for Maria Bamford: Old Baby
No year for Generation Iron 2
No year for Black Mirror


Unnamed: 0,movie_id,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995


In [87]:
movie_year.tail()

Unnamed: 0,movie_id,title,year
9737,193581,Black Butler: Book of the Atlantic,2017
9738,193583,No Game No Life: Zero,2017
9739,193585,Flint,2017
9740,193587,Bungo Stray Dogs: Dead Apple,2018
9741,193609,Andrew Dice Clay: Dice Rules,1991


In [84]:
test_string = "Babylon 5"
title_split = re.split(r'[ ](?=\([1-2][0-9][0-9][0-9])', test_string)
title = title_split[0]
if len(title_split)>1:
    year = title_split[1]
else:
    year = 0
print(year)

0


In [89]:
# Merge movie_year with bag_of_words on movie_id

movies_bag = movie_year.merge(bag_of_words, on="movie_id", how="outer").fillna("")
movies_bag.head()

Unnamed: 0,movie_id,title,year,bag_of_words
0,1,Toy Story,1995,adventure animation children comedy fantasy pi...
1,2,Jumanji,1995,adventure children fantasy fantasy magicboardg...
2,3,Grumpier Old Men,1995,comedy romance moldy old
3,4,Waiting to Exhale,1995,comedy drama romance
4,5,Father of the Bride Part II,1995,comedy pregnancy remake


In [90]:
# Save movie_year to CSV
movies_bag.to_csv("movies_csv/movies_bag.csv", index=False)