In [None]:
import pandas as pd

In [None]:
# Restore the final dataframe `df` produced by admins_notebook.ipynb.
# `%store -r` reloads a variable previously saved with `%store df`, so that
# notebook has to be run (and has to store `df`) before this cell will work.
%store -r df

In [None]:
# Preview the first two rows of the restored dataframe.
df.head(n=2)

We need a SQL query for the following:
1. `adult` - a boolean value
2. `genres` - json array of genres (strings)
3. `language` - short 5-6 length string
4. `title` - title of the movie(string)
5. `overview` - huge text
6. `popularity` - integer from 0 to 10 (inclusive) (with check)
7. `poster_path` - long string (web link to poster)
8. `vote_average` - integer from 0 to 10 (inclusive) (with check)
9. `vote_count` - large integer
10. `keywords` - json array of strings
11. `cast` - array of json array of strings
12. `row_id` - integer(primary key), auto increment = 1
13. `release_year` - 4 digit release year

#### <font color=red>IF THE FOLLOWING SQL DOES NOT RUN, THEN SKIP THIS CELL AND RUN OTHER CELLS</font>

In [None]:
-- Schema for the `movies` table: drops any existing table and recreates it.
-- The same SQL is also available as downloadable files in the repository.
--
-- NOTE(review): this DDL and the column list in the markdown above disagree in
-- places (`language` is VARCHAR(4) vs. "5-6 length", `popularity` is checked
-- against 0..100 vs. "0 to 10", scores are FLOAT vs. "integer") -- confirm
-- which side reflects the intended schema.

DROP TABLE IF EXISTS movies;

CREATE TABLE movies (
    row_id       BIGINT AUTO_INCREMENT PRIMARY KEY,  -- assigned by the database
    adult        BOOLEAN,
    language     VARCHAR(4),
    title        TEXT,
    genres       TEXT,
    overview     TEXT,
    popularity   FLOAT(53) CHECK (popularity >= 0.0 AND popularity <= 100.0),
    poster_path  TEXT,
    vote_average FLOAT(53) CHECK (vote_average >= 0.0 AND vote_average <= 10.0),
    vote_count   BIGINT,
    cast         JSON,
    keywords     TEXT,
    release_year INTEGER
);

##### Following are the columns whose data has to be put in the database directly from the dataframe ( no modification required before persisting )
<ul>
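    <li>row_id (not sent from the dataframe: it is dropped before the insert and generated by the table's AUTO_INCREMENT primary key)</li>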
    <li>adult</li>
    <li>language</li>
    <li>title</li>
    <li>overview</li>
    <li>popularity</li>
    <li>poster_path</li>
    <li>vote_average</li>
    <li>vote_count</li>
    <li>release_year</li>
    <li>genres</li>
</ul>

In [None]:
pip install SQLAlchemy

In [None]:
pip install pymysql

In [None]:
import os

from sqlalchemy import create_engine
import pymysql  # driver named by the `mysql+pymysql` URL scheme; not referenced directly here

# Database URL. Set the MOVIE_DB_URL environment variable to supply real
# credentials without writing them into the notebook; when it is unset the
# original local-development URL is used, so existing setups keep working.
db_url = os.environ.get(
    "MOVIE_DB_URL",
    "mysql+pymysql://movie_database:movie_database@localhost/movie_database",
)
engine = create_engine(db_url, echo=True)  # echo=True logs the SQL that is emitted

# Columns persisted by this cell: everything except `keywords` and `cast`, and
# except `row_id`, which must be omitted because the table generates it through
# its AUTO_INCREMENT primary key.
filtered_df = df.drop(columns=['keywords', 'cast', 'row_id'])

# Append the rows to the existing `movies` table.
# NOTE: if_exists='append' is not idempotent -- re-running this cell without
# recreating the table inserts every movie a second time.
filtered_df.to_sql('movies', con=engine, if_exists='append', index=False)

In [None]:
# Show one row of the full dataframe; `df` itself is unchanged by the insert
# above, because `drop` returned a new frame (`filtered_df`).
df.head(n=1)

In [None]:
# Restore `keywords_column`, which must have been saved earlier with
# `%store keywords_column` (presumably by admins_notebook.ipynb -- confirm).
%store -r keywords_column

#### <font color=orange>FOLLOWING SQL QUERY CREATES A TABLE FOR STORING THE KEYWORDS RELATED TO A MOVIE (WITHOUT NORMALIZATION, NO MANY-TO-MANY RELATION)</font>

In [None]:
-- Flat keyword table: one row per (k_id, kword) pair, with no primary key
-- or foreign key declared.
DROP TABLE IF EXISTS keywords;

CREATE TABLE keywords (
    k_id  BIGINT,        -- identifier the keyword belongs to
    kword VARCHAR(255)   -- a single keyword string
);

#### populate the table `keywords` with the keywords from the `keywords_column`

In [None]:
# Turn the per-movie keyword lists into one (k_id, kword) pair per keyword,
# where k_id is the position of the movie's list inside `keywords_column`.
# NOTE(review): enumerate() counts from 0, whereas an AUTO_INCREMENT `row_id`
# in `movies` normally starts at 1 -- confirm how k_id is matched to movies.
flattened_data = [
    (position, word)
    for position, words in enumerate(keywords_column)
    for word in words
]

# Two-column dataframe whose column names match the `keywords` table.
keywords_column_df = pd.DataFrame(data=flattened_data, columns=["k_id", "kword"])

##### Export CSV (Optional)

In [None]:
# Optional export: write the (k_id, kword) pairs to kc.csv in the working
# directory, without the dataframe index.
keywords_column_df.to_csv(path_or_buf='kc.csv', index=False)

In [None]:
# Append the flattened (k_id, kword) rows to the existing `keywords` table,
# reusing the engine created earlier; the dataframe index is not written.
keywords_column_df.to_sql(name='keywords', con=engine, index=False, if_exists='append')