### note: Documentation explaining all of the code is referenced using comments within the respective cells

In [1]:
#Import necssary packages for the code (if needed, install on device)
import os
import json
import pandas as pd
import pprint
import pymongo
from sqlalchemy import create_engine

In [2]:
#Set the necessary credentials-- the host_name to use for connection strings in later functitons, the different
### ports to refer from the jupyter notebook to the device versions of MongoDB and MySQl, the username and 
###password to access/make changes on Mongo and MySQL, as well as the namees for the database names of the
### created mongo and MySQL databasees

host_name = "localhost"
ports = {"mongo" : 27017, "mysql" : 3306}

user_id = "root"
pwd = "%"

src_dbname = "project_1" #what create in mongo
dst_dbname = "project_1_sql" #destination db

In [3]:
#Define useful functions from class notes and lecture:
## "get_sql_dataframe" connects jupyter to the MySQL databse, useful for verifying that a given database was in fact passed
### into MySQL and for later referencing the output of a given query (I used it in the report section to pass new
### queries through python in SQL language to generate general statistics about the data set)

## "get_mongo_dataframe" connects jupyter to MongoDB, extracting a collection or document from Mongo and converting it
### to a pandas dataframe to then manipulate within the python language and eventually send to SQL

## "set_dataframe" connects jupter again to MySQL, though this time uses the DF from the mongo function^^ to send
### that information to SQL into a database into new tables

def get_sql_dataframe(user_id, pwd, host_name, db_name, sql_query):
    '''Create a connection to the MySQL database'''
    conn_str = f"mysql+pymysql://{user_id}:{pwd}@{host_name}/{db_name}"
    sqlEngine = create_engine(conn_str, pool_recycle=3600)
    
    '''Invoke the pd.read_sql() function to query the database, and fill a Pandas DataFrame.'''
    conn = sqlEngine.connect()
    dframe = pd.read_sql(sql_query, conn);
    conn.close()
    
    return dframe
def get_mongo_dataframe(user_id, pwd, host_name, port, db_name, collection, query):
    '''Create a connection to MongoDB, with or without authentication credentials'''
    if user_id and pwd:
        mongo_uri = 'mongodb://%s:%s@%s:%s/%s' % (username, password, host, port, db_name)
        client = pymongo.MongoClient(mongo_uri)
    else:
        conn_str = f"mongodb://{host_name}:{port}/"
        client = pymongo.MongoClient(conn_str)
    
    '''Query MongoDB, and fill a python list with documents to create a DataFrame'''
    db = client[db_name]
    dframe = pd.DataFrame(list(db[collection].find(query))) #passing in query
    dframe.drop(['_id'], axis=1, inplace=True)
    client.close()
    
    return dframe #gives back data from mongo db query in pandas DF

def set_dataframe(user_id, pwd, host_name, db_name, df, table_name, pk_column, db_operation):
    '''Create a connection to the MySQL database'''
    conn_str = f"mysql+pymysql://{user_id}:{pwd}@{host_name}/{db_name}"
    sqlEngine = create_engine(conn_str, pool_recycle=3600)
    connection = sqlEngine.connect()
    
    '''Invoke the Pandas DataFrame .to_sql( ) function to either create, or append to, a table'''
    if db_operation == "insert":
        df.to_sql(table_name, con=connection, index=False, if_exists='replace')
        sqlEngine.execute(f"ALTER TABLE {table_name} ADD PRIMARY KEY ({pk_column});")
            
    elif db_operation == "update":
        df.to_sql(table_name, con=connection, index=False, if_exists='append')
    
    connection.close()

### Reading in file

Fetch / download / retrieve a remote data file by URL, or ingest a local file
mounted. Suggestions for remote data sources are listed at the end of
this document.

In [4]:
# I cretaed some functions of my own meant to help with converting JSON and CSV files to make use of the previously
## defined class functions. One takes in local files while the other uses links (in this case, links to my personal
## Github site with my Love Island data) to convert one type to the other (CSV to JSON and JSON to CSV). There are doc
## strings in both functions explaining each one in a more detailed way with respect to their parameters

def file_convert(file, file_type, file_path = os.getcwd(), new_file_name = "new_file"):
    """
    use this function to take a file input (CSV or JSON) and convert it to the other to later send to SQL
    
    to use file_convert, you must have LOCAL COPIES of the file you wish to convert; if you have a link-- first use
    the 'link_convert' function
    
    all parametters are type STRING
    'file' is the name of the file BEFORE the file type prefix (either JSON or CSV)
    'file_type' is the kind of file, (csv, json, or link)
    'file_path' is set to working directory, though can be changed to match where you want the file to save
    'new_file name' is set default to just "new file" but can be changed to fit purpose
    
    the function will output the other version of the input file
    """
    try:
        if file_type.lower() == "csv":
            frame = pd.read_csv(file+"."+file_type)
            pathway = (file_path)+"/"+(new_file_name)+".json"
            frame.to_json(path_or_buf = pathway, orient = "records")
        elif file_type.lower() == "json":
            frame = pd.read_json(file+"."+file_type)
            pathway = (file_path)+"/"+(new_file_name)+".csv"
            frame.to_csv(path_or_buf = pathway)
        else:
            print("The file type didn't process correctly. Make sure you are using a JSON or CSV file or a remote link of that file type and that everything is typed in correctly ")
    except:
        print("Either the file name or directory isn't typed in quite right-- make sure the source file exists in that pathway and try again")
        
def link_convert(link, file_type, file_path = os.getcwd(), new_file_name = "new_file"):
    """
    use this function to take a file input (CSV or JSON) FROM A REMOTE LINK to then convert to the 
    other form and later send to SQL
    
    all parametters are type STRING
    'link' is the name of the full URL of the file from the remote source (file type JSON or CSV)
    'file_type' is the kind of file, (CSV or JSON)
    'file_path' is set to working directory, though can be changed to match where you want the file to save
    'new_file name' is set default to just "new file" but can be changed to fit purpose
    
    the function will output the other version of the input file
    """
    try:
        if file_type.lower() == "csv":
            frame = pd.read_csv(link)
            pathway = ((file_path+"/"+new_file_name+".json"))
            frame.to_json(path_or_buf = pathway, orient = "records")
        elif file_type.lower() == "json":
            frame = pd.read_json(link)
            pathway = (file_path+"/"+new_file_name+".csv")
            frame.to_csv(path_or_buf = pathway)
        else:
            print("The file type didn't process correctly. Make sure you are using a JSON or CSV file or a remote link of that file type and that everything is typed in correctly ")
    except:
        print("Either the file name or directory isn't typed in quite right-- make sure the source file exists in that pathway and try again")
        

### Converting File Types using my defined functions

In [5]:
#Convert from the remote version of the files (from github) to the other format (e.g., csv to json)

#Both kinds of file are refereneced here to show that the function works both ways.

link_convert("https://raw.githubusercontent.com/kronayne/assn/master/love_island_contestants.csv","csv", "/Users/kerryronayne/Desktop/db/assn", new_file_name = "love_island_contestants")
link_convert("https://raw.githubusercontent.com/kronayne/assn/master/love_island_contestants.json","json", "/Users/kerryronayne/Desktop/db/assn", new_file_name = "love_island_contestants")

In [6]:
#Convert the newly remote copies to the other format (e.g., csv to json)

#Again, both kinds of files are referenced to show the function works both ways.

#For the rest of the code, I used the converted CSV to JSON version to send to SQL. To use the CSV version, I would
## simple use the pd.read_csv('file name') function to convert the CSV into a pandas frame and manipulate it using
## the same methods as the JSON file to then (in the end) use the 'set_dataframe' function to send to MySQL.

file_convert("love_island_contestants","csv", "/Users/kerryronayne/Desktop/db/assn", "love_island_contestants")
file_convert("love_island_contestants","json", "/Users/kerryronayne/Desktop/db/assn", "love_island_contestants")

### Mongo

In [7]:
#Populate the converted JSON file in Mongo by creating a client connection, loadining in the JSON file, and using the
## 'insert_many' function to then insert the loaded JSON file into Mongo.

conn_str = f"mongodb://{host_name}:{ports['mongo']}/"
client = pymongo.MongoClient(conn_str)
db = client[src_dbname]

contestants = db.contestants

with open('love_island_contestants.json') as openfile:
    file_data = json.load(openfile)

contestants.insert_many(file_data)

client.close

<bound method MongoClient.close of MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True)>

In [8]:
#Create another collection manually in Mongo DB. This just shows that I know how to do this method as well, and creates
## A new collection with just the season information and then inserting a unique season key for the collection.
seasons = db.season

season_data = [{"Season": 1,
                "Days of Season": 41,
                "M_Winner": "Max Morley", 
                "W_Winner":"Jess Hayes"}
               ,
               {"Season": 2,
                "Days of Season": 45,
                "M_Winner": "Nathan Massey",
                "W_Winner":"Cara De La Hoyde"}
               ,
               {"Season": 3,
                "Days of Season": 52,
                "M_Winner": "Kem Cetinay",
                "W_Winner":"Amber Davies"}
               ,
               {"Season": 4,
                "Days of Season": 59,
                "M_Winner": "Jack Fincham",
                "W_Winner":"Dani Dyer"}]

result = seasons.insert_many(season_data)
pprint.pprint(result.inserted_ids)

[ObjectId('62421c02d1f483c8a3eec8e4'),
 ObjectId('62421c02d1f483c8a3eec8e5'),
 ObjectId('62421c02d1f483c8a3eec8e6'),
 ObjectId('62421c02d1f483c8a3eec8e7')]


In [10]:
# Extract the newly inserted mongo collection and put it into Jupyter via a pandas dataframe by defining the port,
## and using the 'get_mongo' function to do the actual putting into a DF. The head function just helps show
## everytthing was extracted as it should be. 

#Along the way of doing my project and re-converting everything, I ended up with this "unnamed:0" column that is just
##a wrongly-named copy of the inserted contestant key. This doesn't really have any bearing on the project code, though,
## because when modifying the data, I first select only the columns I wanted to form the DF and THEN insert the key
##column so this 'unnamed:0' column just so happens to be subsetted out.
query = {}
port = ports["mongo"]
collection = "contestants"

df_contestants = get_mongo_dataframe(None, None, host_name, port, src_dbname, collection, query)
df_contestants.head(2)

Unnamed: 0.1,Unnamed: 0,Season,Name,Gender,Age,Ethnicity,Town,Job,Status,Days on show,Entrance Day
0,0,1,Ben Porter,Male,24,White,Wakefield,Junior manager,Dumped,2.0,32.0
1,1,1,Bethany Rogers,Female,19,White,Leeds,Dancer,Dumped,8.0,14.0


In [11]:
# This does the same extracting from mongo as the previous cell, only doing so with the manual season collectiton
query = {}
port = ports["mongo"]
collection = "season"

df_seasons = get_mongo_dataframe(None, None, host_name, port, src_dbname, collection, query)
df_seasons.head(2)

Unnamed: 0,Season,Days of Season,M_Winner,W_Winner
0,1,41,Max Morley,Jess Hayes
1,2,45,Nathan Massey,Cara De La Hoyde


### Modify columns

Modify the number of columns from the source to the destination,reducing or adding columns.

In [12]:
#Add a new column into the contestant collection with a unique key for each row-- if its already been inserted because
## of already running the code, the except error will note that.
try:
    df_contestants.insert(0, "Contestant_key", range(0, df_contestants.shape[0]))
except:
    print("This column has already been inserted. Move on to the next cell!")

In [13]:
#Perform a series of modifications/maniplations to the defined dataframes for help reporting later on, rename
### certain colulumns to be more clear or without spacees, and to eliminate unnecessary columns

#Rename columns without spaces
column_name_map = {"Entrance Day" : "Entrance_day",
                   "Days on show" : "Days_on_show"
                  }
df_contestants.rename(columns=column_name_map, inplace=True)

#Add column with the day # a given contestant left
df_contestants['Exit_day'] = df_contestants["Entrance_day"] + df_contestants['Days_on_show']

#Drop columns/make sure they are in the preferred order
df_contestants = df_contestants[['Contestant_key', 'Name','Season', 'Gender', 'Age', 'Town', 'Job','Status', 'Entrance_day', 'Exit_day', 'Days_on_show',]]

#Drop the incomplete season rows-- part of the show was ongoing when the data was tabulated and I don't want
#those to cloud/complicate the end report and conclusions
df_contestants= df_contestants[df_contestants['Season'] != 5]

#Shorten gender terms, just a preference thing!
df_contestants['Gender'].replace(['Female','Male'],['F','M'],inplace=True)


In [14]:
df_contestants.head(2)

Unnamed: 0,Contestant_key,Name,Season,Gender,Age,Town,Job,Status,Entrance_day,Exit_day,Days_on_show
0,0,Ben Porter,1,M,24,Wakefield,Junior manager,Dumped,32.0,34.0,2.0
1,1,Bethany Rogers,1,F,19,Leeds,Dancer,Dumped,14.0,22.0,8.0


In [15]:
#Add an index column to the other collection
try:
    df_seasons.insert(0, "Season_key", range(0, df_seasons.shape[0]))
except:
    print("This column has already been inserted. Move on to the next cell!")

In [16]:
df_seasons.head(2)

Unnamed: 0,Season_key,Season,Days of Season,M_Winner,W_Winner
0,0,1,41,Max Morley,Jess Hayes
1,1,2,45,Nathan Massey,Cara De La Hoyde


### Write to SQL Database

The converted (new) file should be written to disk (local file) or written to a SQL database.

In [17]:
#Create a new project-specific database in MySQL by creating a connection with the sql alchemy package. If it has
## already been created, the exception will note that.

try:
    exec_sql = f"CREATE DATABASE `{dst_dbname}`;"
    conn_str = f"mysql+pymysql://{user_id}:{pwd}@{host_name}"
    sqlEngine = create_engine(conn_str, pool_recycle=3600)
    sqlEngine.execute(exec_sql)
    sqlEngine.execute(f"USE {dst_dbname};")
except:
    print("This database already exists. Either drop the scheme in MySQL or move on to the next cell!")

In [18]:
#Sending the Pandas dataframes to mySQL using the 'set_dataframe' function in class.

dataframe = df_contestants
table_name = 'love_island_contestants'
primary_key = 'Contestant_key'
db_operation = "insert"

set_dataframe(user_id, pwd, host_name, dst_dbname, dataframe, table_name, primary_key, db_operation)

In [19]:
#Doing the same sending as the previous cell, only with the seasons DF
dataframe = df_seasons
table_name = 'love_island_seasons'
primary_key = 'Season_key'
db_operation = "insert"

set_dataframe(user_id, pwd, host_name, dst_dbname, dataframe, table_name, primary_key, db_operation)

In [20]:
#Verify the above DFs were correctly sent from this code into MySQL by using the 'get_sql' function from class,
##defining a SQL query and making that into a pandas DF and checking that it looks how I want it to!

sql_contestants = "SELECT * FROM project_1_sql.love_island_contestants;"
df_contestants = get_sql_dataframe(user_id, pwd, host_name, dst_dbname, sql_contestants)
df_contestants.head(2)

Unnamed: 0,Contestant_key,Name,Season,Gender,Age,Town,Job,Status,Entrance_day,Exit_day,Days_on_show
0,0,Ben Porter,1,M,24,Wakefield,Junior manager,Dumped,32.0,34.0,2.0
1,1,Bethany Rogers,1,F,19,Leeds,Dancer,Dumped,14.0,22.0,8.0


In [21]:
#Verify in the same way as the previous cell, only with the seasons DF
sql_seasons = "SELECT * FROM project_1_sql.love_island_contestants;"
df__seasons = get_sql_dataframe(user_id, pwd, host_name, dst_dbname, sql_seasons)
df_seasons.head(2)

Unnamed: 0,Season_key,Season,Days of Season,M_Winner,W_Winner
0,0,1,41,Max Morley,Jess Hayes
1,1,2,45,Nathan Massey,Cara De La Hoyde


### Summary

Generate a brief summary of the data file ingestion including:
a) Number of records
b) Number of columns

In [22]:
#Generate a series of SQL queries to get more detailed information from the SQL version of the data. One looks at the
## most common jobs and towns of the shows contestants in the first four seasons, and the latter two look at
## the average days on the show, entrance day, and age of winners compared to the average contestant
average_jobs = "SELECT Job, COUNT(*) AS 'Frequency'FROM project_1_sql.love_island_contestants GROUP BY Job ORDER BY COUNT(*) DESC LIMIT 10;"
average_towns = "SELECT Town, COUNT(*) AS 'Frequency' FROM project_1_sql.love_island_contestants GROUP BY Town ORDER BY COUNT(*) DESC LIMIT 10;"
winner_averages = "SELECT ROUND(AVG(Days_on_show),2) AS 'Average Days', ROUND(AVG(Age),2) AS 'Average Age', ROUND(AVG(Entrance_day),2) AS 'Average Entrance Day' FROM project_1_sql.love_island_contestants WHERE Status = 'Winner';"
standard_averages = "SELECT ROUND(AVG(Days_on_show),2) AS 'Average Days', ROUND(AVG(Age),2) AS 'Average Age', ROUND(AVG(Entrance_day),2) AS 'Average Entrance Day' FROM project_1_sql.love_island_contestants"

In [23]:
#Use the 'get_sql' function from before to set the output of the ^^ SQL queries into a dataframe to then print out in
## a report. This way, the selected data/averages/counts are in Jupyter rather than just in MySQL

df_avg_jobs = get_sql_dataframe(user_id, pwd, host_name, dst_dbname, average_jobs)
df_avg_towns = get_sql_dataframe(user_id, pwd, host_name, dst_dbname, average_towns)
df_stn_avg = get_sql_dataframe(user_id, pwd, host_name, dst_dbname, standard_averages)
df_win_avg = get_sql_dataframe(user_id, pwd, host_name, dst_dbname, winner_averages)

In [24]:
#Print out all of the summary statistics! I used a combination of print statements and indexed dataframes to provide
## a general overview of the first four seasons of the show and see if there is anything that stands out! 

#One thing to note is that the jobs and town information was weirdly put into the source dataframe so it might be 
##a bit skewed as some people have two jobs listed together in the same one cell, and some may be from more remote
##towns and just say they are from a city like London to make things simpler in the actual show. Funny thing is I only
##really noticed the multiple jobs thing when only two contestants were only 2 scaffolders listed because it feels like
##every guy on Love Island says he does scaffolding. It's a whole bit my friends and I have when doing our Islander
##impressions!

#Also, out of the winner vs average contestant comparison-- it makes sense most everyone is the same age simply because
## the contestant pool is only ever 20 somethings (there isn't much variation). I hadn't really noticed when watching
## that most of the winners were original contestants rather than someone from Casa Amor or a mid-season bombshell.
## There is a clear adavantage to coming earlier! More time to form connections to get a friend save later on if 
## you weren't coupled up, and control when it comes time for Casa Amor (I realize if you haven't watched the show,
## this probably doesn't make any sense-- but it was cool for me to see!)


print("GENERAL CONTESTANT SUMMARY:")
print("Number of records: ", df_contestants.shape[0])
print("Number of columns: ", df_contestants.shape[1])

print("\nGENERAL SEASON SUMMARY:")
print("Number of records: ", df_seasons.shape[0])
print("Number of columns: ", df_seasons.shape[1])

print("\nMost frequent contestant jobs:\n", df_avg_jobs.head(10))
print("\nMost frequent contestant hometowns:\n",df_avg_towns.head(10))

difference1 = ((df_win_avg["Average Days"][0])-(df_stn_avg["Average Days"][0]))
difference2 = ((df_stn_avg["Average Age"][0])-(df_win_avg["Average Age"][0]))
difference3 = ((df_stn_avg["Average Entrance Day"][0])-(df_win_avg["Average Entrance Day"][0]))

print("\nWinners vs. the average contestant:")
print("Number of days in competition:")
print("Winners:",df_win_avg["Average Days"][0])
print("Average:",df_stn_avg["Average Days"][0])
print("Difference:",round(difference1,2), "days\n")

print("Contestant age:")
print("Winners:",df_win_avg["Average Age"][0], "years old")
print("Average:",df_stn_avg["Average Age"][0], "years old")
print("Difference:",round(difference2,2), "years\n")

print("Entrance day:")
print("Winners:","Day",df_win_avg["Average Entrance Day"][0])
print("Average:","Day", df_stn_avg["Average Entrance Day"][0])
print("Difference:",round(difference3,2), "days\n")
     

GENERAL CONTESTANT SUMMARY:
Number of records:  119
Number of columns:  11

GENERAL SEASON SUMMARY:
Number of records:  4
Number of columns:  5

Most frequent contestant jobs:
                   Job  Frequency
0               Model         10
1    Personal trainer          7
2       Glamour model          5
3      Make-up artist          4
4             Student          3
5         Hairdresser          2
6            Salesman          2
7              Dancer          2
8  Property developer          2
9        Entrepreneur          2

Most frequent contestant hometowns:
             Town  Frequency
0         London         16
1          Essex         14
2     Manchester          6
3           Kent          5
4      Newcastle          5
5     Birmingham          5
6  Hertfordshire          4
7    East London          3
8      Liverpool          3
9         Oxford          2

Winners vs. the average contestant:
Number of days in competition:
Winners: 47.62
Average: 22.39
Difference: 25.2