In [1]:
import csv
from DATA225utils import make_connection
import pandas as pd
import os
from configparser import ConfigParser
from mysql.connector import MySQLConnection

In [2]:
def read_config(config_file = 'config.ini', section = 'mysql'):
    """
    Read a configuration file config_file and the given section. 
    If successful, return the configuration as a dictionary,
    else raise an exception. 
    """
    parser = ConfigParser()
    
    # Does the configuration file exist?
    if os.path.isfile(config_file):
        parser.read(config_file)
    else:
        raise Exception(f"Configuration file '{config_file}' "
                        "doesn't exist.")
    
    config = {}
    
    # Does it have the right section?
    if parser.has_section(section):
        
        # Parse the configuration file.
        items = parser.items(section)
        
        # Construct the parameter dictionary.
        for item in items:
            config[item[0]] = item[1]
            
    else:
        raise Exception(f"Section '{section}' missing "
                        f"in configuration file '{config_file}'.")
    
    return config

In [3]:
db_config = read_config('movies.ini')
db_config

{'host': 'localhost',
 'database': 'movies',
 'user': 'root',
 'password': 'seekrit'}

In [4]:
def make_connection(config_file = 'config.ini', section = 'mysql'):
    """
    Make a connection to a database with the configuration file
    config_file and the given section. If successful, return 
    the connection, else raise an exception.
    """
    try:
        db_config = read_config(config_file, section)            
        conn = MySQLConnection(**db_config)
        
        if conn.is_connected():
            return conn
                
    except Error as e:
        raise Exception(f'Connection failed.\n{e}')

In [5]:
conn = make_connection('movies.ini')
conn

<mysql.connector.connection.MySQLConnection at 0x1fac006dc10>

In [6]:
cursor = conn.cursor()
cursor

<mysql.connector.cursor.MySQLCursor at 0x1fabe8cbb50>

# Question 1

In [None]:
#Create and load a database table. Then write one or more INSERT INTO command
#with an embedded SELECT to create one or more tables from the first table. Use CASE
#with the SELECT to perform data transformation(s). Display the contents of the first table
#and the newly created table(s).

In [8]:
cursor.execute('DROP TABLE IF EXISTS MM')

sql = ( """
        CREATE TABLE MM
        (
          ReleaseDate DATE NOT NULL,
          IMDB_ID VARCHAR(255) NOT NULL,
          Original_Title VARCHAR(255) NOT NULL,
          Budget CHAR(255) NOT NULL,
          PRIMARY KEY (IMDB_ID)
        )
        """
      )

cursor.execute(sql)

In [9]:
sql = ("""
    INSERT INTO MM
    SELECT ReleaseDate, IMDB_ID, Original_Title, 
        CASE 
            WHEN Budget < 6000000 THEN 'Low'
            WHEN Budget > 6000000 THEN 'High'
            ELSE 'Right-Amount'
        END AS Budget
    FROM Movie_Metadata
    WHERE ReleaseDate > '1995-07-18'

    """)

cursor.execute(sql)
conn.commit()

### Here we classified budget as Low: if less than 6000000 dollars, High: if more than 6000000 dollars, and Right-Amount: exactly 6000000 dollars.

In [11]:
select_vendor = ("""
                 SELECT * FROM MM
                 """
                  )
cursor.execute(select_vendor)
df1 = pd.DataFrame(cursor.fetchall(), columns = ['Release_Date', 'IMDB_ID', 'Original_Title', 'Budget'])
display(df1)

Unnamed: 0,Release_Date,IMDB_ID,Original_Title,Budget
0,1995-11-16,tt0113189,GoldenEye,High
1,1995-10-27,tt0113627,Leaving Las Vegas,Low
2,1995-09-22,tt0114369,Se7en,High
3,1995-10-30,tt0114709,Toy Story,High
4,1995-07-19,tt0114814,The Usual Suspects,Right-Amount


# Question 2

In [12]:
#Use one or more aggregate functions with GROUP BY. Explain in a sentence or two what
#the nested query is supposed to do and display the result.

### This aggregate function gives the frequency for every category inside the MM table for the Budget.

In [13]:
sql2 =  ("""
        SELECT Budget, COUNT(*)
        FROM MM
        GROUP BY Budget
        """)

cursor.execute(sql2)
rows = cursor.fetchall()
df2 = pd.DataFrame(rows, columns=['Category', 'Count'])
display(df2)

Unnamed: 0,Category,Count
0,High,3
1,Low,1
2,Right-Amount,1


# Question 3

In [14]:
#Use one or more aggregate functions with GROUP BY HAVING. Explain in a sentence or
#two what the nested query is supposed to do and display the result.

### This aggregate function shows the frequency for every category with only 1 count.

In [15]:
sql3 = ("""
                 SELECT Budget, COUNT(*)
                 FROM MM
                 GROUP BY Budget
                 HAVING COUNT(*) = 1
                
                 """
                  )
cursor.execute(sql3)
df1 = pd.DataFrame(cursor.fetchall(), columns = ['Category', 'Count'])
display(df1)

Unnamed: 0,Category,Count
0,Low,1
1,Right-Amount,1


# Question 4

In [16]:
#Write a SELECT query with a nested SELECT. Explain in a sentence or two what the
#nested query is supposed to do and display the result.

### This query selects the IMDB_ID and Homepage from movie_metadata where the Budget from mm is High

In [17]:
sql4 = ("""
                 SELECT IMDB_ID, Homepage
                 FROM movie_metadata
                 WHERE IMDB_ID IN (SELECT IMDB_ID
                                   FROM mm
                                   WHERE Budget = 'High')
                 """
                  )
cursor.execute(sql4)
df1 = pd.DataFrame(cursor.fetchall(), columns = ['IMDB_ID', 'Homepage'])
display(df1)


Unnamed: 0,IMDB_ID,Homepage
0,tt0113189,http://www.mgm.com/view/movie/757/Goldeneye/
1,tt0114369,http://www.sevenmovie.com/
2,tt0114709,http://toystory.disney.com/toy-story


# Question 5

In [18]:
# Perform a left outer join between two tables and display the result.

### This displays the first name, last name, user name, the rating the user gave, and the movie id of the rating using left outer join.

In [19]:
sql5 = ("""
         SELECT u.First_Name, u.Last_Name, u.User_Name, r.Ratings, r.Movie_ID
         FROM Users u
         LEFT OUTER JOIN movie_ratings r on r.User_ID = u.User_ID
        """
        )
cursor.execute(sql5)
df1 = pd.DataFrame(cursor.fetchall(), columns=['First Name', 'Last Name', 'User Name', 'Rating', 'Movie ID'])
display(df1)


Unnamed: 0,First Name,Last Name,User Name,Rating,Movie ID
0,F,L,U,,
1,Joseph,Chang,joseph123,6.5,1.0
2,Justin,Wang,justinw002,8.0,47.0
3,Ron,Mak,ronmak145,9.5,50.0
4,Satyaprakash,Mishra,sp17mishra,6.0,25.0
5,Shrey,Jain,shrey01,5.5,10.0


# Question 6

In [20]:
# Perform a right outer join between two tables and display the result.

### This displays all from Users right outer joined with movie ratings

In [27]:
sql6 = ("""
         SELECT *
         FROM Users u
         RIGHT OUTER JOIN movie_ratings r on r.User_ID = u.User_ID        
        """
        )
cursor.execute(sql6)
df1 = pd.DataFrame(cursor.fetchall(), columns=['User ID', 'First Name', 'Last Name', 'User Name', 'Timestamp', 'Rating', 'User ID', 'Receipt', 'Movie ID'])
display(df1)


Unnamed: 0,User ID,First Name,Last Name,User Name,Timestamp,Rating,User ID.1,Receipt,Movie ID
0,111,Joseph,Chang,joseph123,1425941529,6.5,111,1,1
1,222,Shrey,Jain,shrey01,1425942435,5.5,222,2,10
2,333,Satyaprakash,Mishra,sp17mishra,1425941300,6.0,333,3,25
3,444,Justin,Wang,justinw002,1425942007,8.0,444,4,47
4,555,Ron,Mak,ronmak145,1425942139,9.5,555,5,50


# Question 7

In [22]:
# Perform a full outer join between two tables and display the result.

### This is a full outer join of users and movie ratings. Created by unioning left and right outer joins.

In [29]:
# given query
sql7 = ("""
         SELECT *
         FROM Users u
         LEFT OUTER JOIN movie_ratings r on r.User_ID = u.User_ID
         UNION
         SELECT *
         FROM Users u
         RIGHT OUTER JOIN movie_ratings r on r.User_ID = u.User_ID            
        """
        )
cursor.execute(sql7)
df1 = pd.DataFrame(cursor.fetchall(),columns=['User ID', 'First Name', 'Last Name', 'User Name', 'Timestmap', 'Rating', 'User ID', 'Receipt','Movie ID'])
display(df1)


Unnamed: 0,User ID,First Name,Last Name,User Name,Timestmap,Rating,User ID.1,Receipt,Movie ID
0,777,F,L,U,,,,,
1,111,Joseph,Chang,joseph123,1425942000.0,6.5,111.0,1.0,1.0
2,444,Justin,Wang,justinw002,1425942000.0,8.0,444.0,4.0,47.0
3,555,Ron,Mak,ronmak145,1425942000.0,9.5,555.0,5.0,50.0
4,333,Satyaprakash,Mishra,sp17mishra,1425941000.0,6.0,333.0,3.0,25.0
5,222,Shrey,Jain,shrey01,1425942000.0,5.5,222.0,2.0,10.0


# Question 8

In [24]:
# Create a view and display its contents. Use the view in a join with other table(s) and display the result.

### Creates a view on movie_metadata_genres where genres = Drama
### Performs a left outer join on movie_links and the view on IMDB_ID

In [34]:
# given query
cursor.execute("DROP VIEW IF EXISTS genres")
sql8 = ("""
        CREATE VIEW genres AS
        SELECT *
        FROM movie_metadata_genres
        WHERE Genres = 'Drama'
         """
        )
cursor.execute(sql8)
sql81 = ("""
        SELECT * from genres
        """)
cursor.execute(sql81)
df1 = pd.DataFrame(cursor.fetchall(), columns=['Genre', 'IMDB ID'])
display(df1)

sql8join = ("""
            SELECT * 
            FROM movie_links m
            LEFT OUTER JOIN genres on m.IMDB_ID = genres.IMDB_ID
            """)
cursor.execute(sql8join)
df1 = pd.DataFrame(cursor.fetchall(), columns=['Movie ID', 'TMDB ID','IMDB ID', 'Genre', 'IMDB ID'])
display(df1)

Unnamed: 0,Genre,IMDB ID
0,Drama,tt0113627
1,Drama,tt0114814


Unnamed: 0,Movie ID,TMDB ID,IMDB ID,Genre,IMDB ID.1
0,1,862,tt0114709,,
1,10,710,tt0113189,,
2,25,451,tt0113627,Drama,tt0113627
3,47,807,tt0114369,,
4,50,629,tt0114814,Drama,tt0114814
