In [1]:
#dependencies
import pandas as pd
from sqlalchemy import create_engine, inspect
import numpy as np

# Extract:

### Kaggle Dataset: https://www.kaggle.com/lava18/google-play-store-apps?select=googleplaystore.csv

### CSVs:
- ***googleplaystore.csv:*** Basic app details
- ***googleplaystore_user_reviews.csv:*** The first 100 user reviews for apps on the Google Play Store through the letter 'H'


In [2]:
#import two csv files
play_store = "Resources/googleplaystore.csv"
reviews="Resources/googleplaystore_user_reviews.csv"

play_store_df= pd.read_csv(play_store)
reviews_df= pd.read_csv(reviews)

# Transform:

In [3]:
# View the Play Store DF
play_store_df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,7-Jan-18,1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,15-Jan-18,2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,1-Aug-18,1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,8-Jun-18,Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,20-Jun-18,1.1,4.4 and up


In [4]:
# View the Reviews DF
reviews_df.head()

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
2,10 Best Foods for You,,,,
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3


## Cleaning:
### - Drop N/A values
### - Reset the Index
### - Create a Primary Key ID column
### - Set the ID column as the Index

In [5]:
# Drop the NA values
play_store_df = play_store_df.dropna(how="any")
play_store_df = play_store_df.reset_index()
play_store_df = play_store_df.rename(columns={"index": "id"})
play_store_df.set_index("id", inplace=True)
play_store_df.head()

Unnamed: 0_level_0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,7-Jan-18,1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,15-Jan-18,2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,1-Aug-18,1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,8-Jun-18,Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,20-Jun-18,1.1,4.4 and up


In [6]:
# Drop the NA values
reviews_df = reviews_df.dropna(how="any")
reviews_df = reviews_df.reset_index()
reviews_df = reviews_df.rename(columns={"index": "id"})
reviews_df.set_index("id", inplace=True)
reviews_df.head()

Unnamed: 0_level_0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3
5,10 Best Foods for You,Best way,Positive,1.0,0.3


### - Drop unnecessary columns & keep the columns we want to use

In [7]:
#drop columns from play store dataframe
del play_store_df["Category"]
del play_store_df["Size"]
del play_store_df["Type"]
del play_store_df["Last Updated"]
del play_store_df["Current Ver"]    
del play_store_df["Android Ver"]
play_store_df.head()

Unnamed: 0_level_0,App,Rating,Reviews,Installs,Price,Content Rating,Genres
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,Photo Editor & Candy Camera & Grid & ScrapBook,4.1,159,"10,000+",0,Everyone,Art & Design
1,Coloring book moana,3.9,967,"500,000+",0,Everyone,Art & Design;Pretend Play
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",4.7,87510,"5,000,000+",0,Everyone,Art & Design
3,Sketch - Draw & Paint,4.5,215644,"50,000,000+",0,Teen,Art & Design
4,Pixel Draw - Number Art Coloring Book,4.3,967,"100,000+",0,Everyone,Art & Design;Creativity


In [8]:
#drop columns from reviews dataframe
del reviews_df["Translated_Review"]
reviews_df.head()

Unnamed: 0_level_0,App,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,10 Best Foods for You,Positive,1.0,0.533333
1,10 Best Foods for You,Positive,0.25,0.288462
3,10 Best Foods for You,Positive,0.4,0.875
4,10 Best Foods for You,Positive,1.0,0.3
5,10 Best Foods for You,Positive,1.0,0.3


### - Rename columns to match the SQL database tables

In [9]:
# Rename the columns for reviews df
reviews_df = reviews_df.rename(columns={"App": "app", "Sentiment": "sentiment", "Sentiment_Polarity": "sentiment_polarity", 
                                        "Sentiment_Subjectivity": "sentiment_subjectivity"})
reviews_df.head()

Unnamed: 0_level_0,app,sentiment,sentiment_polarity,sentiment_subjectivity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,10 Best Foods for You,Positive,1.0,0.533333
1,10 Best Foods for You,Positive,0.25,0.288462
3,10 Best Foods for You,Positive,0.4,0.875
4,10 Best Foods for You,Positive,1.0,0.3
5,10 Best Foods for You,Positive,1.0,0.3


In [10]:
# Rename the columns for play store df
play_store_df = play_store_df.rename(columns={"App": "app", "Rating": "rating", "Reviews": "reviews", 
                                              "Installs": "installs", "Price": "price", 
                                              "Content Rating": "content_rating", "Genres": "genres"})
play_store_df.head()

Unnamed: 0_level_0,app,rating,reviews,installs,price,content_rating,genres
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,Photo Editor & Candy Camera & Grid & ScrapBook,4.1,159,"10,000+",0,Everyone,Art & Design
1,Coloring book moana,3.9,967,"500,000+",0,Everyone,Art & Design;Pretend Play
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",4.7,87510,"5,000,000+",0,Everyone,Art & Design
3,Sketch - Draw & Paint,4.5,215644,"50,000,000+",0,Teen,Art & Design
4,Pixel Draw - Number Art Coloring Book,4.3,967,"100,000+",0,Everyone,Art & Design;Creativity


### - Re-format the Installs column to remove the "+" and the ","

In [11]:
# Remove the plus sign on Installs
play_store_df["installs"] = play_store_df["installs"].str.replace('+', '')
play_store_df["installs"] = play_store_df["installs"].str.replace(',', '').astype(int)
play_store_df.head()

Unnamed: 0_level_0,app,rating,reviews,installs,price,content_rating,genres
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,Photo Editor & Candy Camera & Grid & ScrapBook,4.1,159,10000,0,Everyone,Art & Design
1,Coloring book moana,3.9,967,500000,0,Everyone,Art & Design;Pretend Play
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",4.7,87510,5000000,0,Everyone,Art & Design
3,Sketch - Draw & Paint,4.5,215644,50000000,0,Teen,Art & Design
4,Pixel Draw - Number Art Coloring Book,4.3,967,100000,0,Everyone,Art & Design;Creativity


### - Change the data types to reflect the data types used in the SQL database

In [12]:
# Change the data type of reviews to int
play_store_df["reviews"] = play_store_df["reviews"].astype(int)

# Remove the $ and change the data type to float
play_store_df["price"] = play_store_df["price"].str.replace('$', '').astype(float)

# Load:

In [13]:
# Export csvs
play_store_df.to_csv("Output/play_store.csv")
reviews_df.to_csv("Output/reviews.csv")

## Connect to SQL Database

In [14]:
# Connect to the sql database
connection_string = "postgres:postgres@localhost:5432/google_play_store_db"
engine = create_engine(f'postgresql://{connection_string}')

In [15]:
# Get the table names
Inspector = inspect(engine)
Inspector.get_table_names()

['apps', 'reviews']

## Load the cleaned dataframes to the SQL Database

In [16]:
# Append the dataframes to sql database
play_store_df.to_sql(name='apps', con=engine, if_exists='append', index=True)
reviews_df.to_sql(name='reviews', con=engine, if_exists='append', index=True)

IntegrityError: (psycopg2.errors.UniqueViolation) duplicate key value violates unique constraint "apps_pkey"
DETAIL:  Key (id)=(0) already exists.

[SQL: INSERT INTO apps (id, app, rating, reviews, installs, price, content_rating, genres) VALUES (%(id)s, %(app)s, %(rating)s, %(reviews)s, %(installs)s, %(price)s, %(content_rating)s, %(genres)s)]
[parameters: ({'id': 0, 'app': 'Photo Editor & Candy Camera & Grid & ScrapBook', 'rating': 4.1, 'reviews': 159, 'installs': 10000, 'price': 0.0, 'content_rating': 'Everyone', 'genres': 'Art & Design'}, {'id': 1, 'app': 'Coloring book moana', 'rating': 3.9, 'reviews': 967, 'installs': 500000, 'price': 0.0, 'content_rating': 'Everyone', 'genres': 'Art & Design;Pretend Play'}, {'id': 2, 'app': 'U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'rating': 4.7, 'reviews': 87510, 'installs': 5000000, 'price': 0.0, 'content_rating': 'Everyone', 'genres': 'Art & Design'}, {'id': 3, 'app': 'Sketch - Draw & Paint', 'rating': 4.5, 'reviews': 215644, 'installs': 50000000, 'price': 0.0, 'content_rating': 'Teen', 'genres': 'Art & Design'}, {'id': 4, 'app': 'Pixel Draw - Number Art Coloring Book', 'rating': 4.3, 'reviews': 967, 'installs': 100000, 'price': 0.0, 'content_rating': 'Everyone', 'genres': 'Art & Design;Creativity'}, {'id': 5, 'app': 'Paper flowers instructions', 'rating': 4.4, 'reviews': 167, 'installs': 50000, 'price': 0.0, 'content_rating': 'Everyone', 'genres': 'Art & Design'}, {'id': 6, 'app': 'Smoke Effect Photo Maker - Smoke Editor', 'rating': 3.8, 'reviews': 178, 'installs': 50000, 'price': 0.0, 'content_rating': 'Everyone', 'genres': 'Art & Design'}, {'id': 7, 'app': 'Infinite Painter', 'rating': 4.1, 'reviews': 36815, 'installs': 1000000, 'price': 0.0, 'content_rating': 'Everyone', 'genres': 'Art & Design'}  ... displaying 10 of 9360 total bound parameter sets ...  {'id': 10839, 'app': 'The SCP Foundation DB fr nn5n', 'rating': 4.5, 'reviews': 114, 'installs': 1000, 'price': 0.0, 'content_rating': 'Mature 17+', 'genres': 'Books & Reference'}, {'id': 10840, 'app': 'iHoroscope - 2018 Daily Horoscope & Astrology', 'rating': 4.5, 'reviews': 398307, 'installs': 10000000, 'price': 0.0, 'content_rating': 'Everyone', 'genres': 'Lifestyle'})]
(Background on this error at: http://sqlalche.me/e/14/gkpj)

## Check the data was loaded correctly by querying the database

In [None]:
# Join the two tables to get results
engine.execute(
    """SELECT a.app, r.sentiment, a.reviews
    FROM apps as a
    INNER JOIN reviews as r on
    a.app = r.app
    LIMIT 10""").fetchall()