In [1]:
#dependencies
import pandas as pd
from sqlalchemy import create_engine, inspect
import numpy as np

# Extract:

### Kaggle Dataset: https://www.kaggle.com/lava18/google-play-store-apps?select=googleplaystore.csv

### CSVs:
- ***googleplaystore.csv:*** Basic app details
- ***googleplaystore_user_reviews.csv:*** The first 100 user reviews for apps on the Google Play Store through the letter 'H'


In [2]:
#import two csv files
play_store = "Resources/googleplaystore.csv"
reviews="Resources/googleplaystore_user_reviews.csv"

play_store_df= pd.read_csv(play_store)
reviews_df= pd.read_csv(reviews)

# Transform:

In [3]:
# View the Play Store DF
play_store_df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,7-Jan-18,1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,15-Jan-18,2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,1-Aug-18,1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,8-Jun-18,Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,20-Jun-18,1.1,4.4 and up


In [4]:
# View the Reviews DF
reviews_df.head()

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
2,10 Best Foods for You,,,,
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3


## Cleaning:
### - Drop N/A values
### - Reset the Index
### - Create a Primary Key ID column
### - Set the ID column as the Index

In [5]:
# Drop the NA values
play_store_df = play_store_df.dropna(how="any")
play_store_df = play_store_df.reset_index(drop=True).reset_index()
play_store_df = play_store_df.rename(columns={"index": "id"})
play_store_df.set_index("id", inplace=True)
play_store_df.head()

Unnamed: 0_level_0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,7-Jan-18,1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,15-Jan-18,2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,1-Aug-18,1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,8-Jun-18,Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,20-Jun-18,1.1,4.4 and up


In [6]:
# Drop the NA values
reviews_df = reviews_df.dropna(how="any")
reviews_df = reviews_df.reset_index(drop=True).reset_index()
reviews_df = reviews_df.rename(columns={"index": "id"})
reviews_df.set_index("id", inplace=True)
reviews_df.head()

Unnamed: 0_level_0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
2,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
3,10 Best Foods for You,Best idea us,Positive,1.0,0.3
4,10 Best Foods for You,Best way,Positive,1.0,0.3


### - Drop unnecessary columns & keep the columns we want to use

In [7]:
#drop columns from play store dataframe
del play_store_df["Category"]
del play_store_df["Size"]
del play_store_df["Type"]
del play_store_df["Last Updated"]
del play_store_df["Current Ver"]    
del play_store_df["Android Ver"]
play_store_df

Unnamed: 0_level_0,App,Rating,Reviews,Installs,Price,Content Rating,Genres
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,Photo Editor & Candy Camera & Grid & ScrapBook,4.1,159,"10,000+",0,Everyone,Art & Design
1,Coloring book moana,3.9,967,"500,000+",0,Everyone,Art & Design;Pretend Play
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",4.7,87510,"5,000,000+",0,Everyone,Art & Design
3,Sketch - Draw & Paint,4.5,215644,"50,000,000+",0,Teen,Art & Design
4,Pixel Draw - Number Art Coloring Book,4.3,967,"100,000+",0,Everyone,Art & Design;Creativity
...,...,...,...,...,...,...,...
9355,FR Calculator,4.0,7,500+,0,Everyone,Education
9356,Sya9a Maroc - FR,4.5,38,"5,000+",0,Everyone,Education
9357,Fr. Mike Schmitz Audio Teachings,5.0,4,100+,0,Everyone,Education
9358,The SCP Foundation DB fr nn5n,4.5,114,"1,000+",0,Mature 17+,Books & Reference


In [8]:
#drop columns from reviews dataframe
del reviews_df["Translated_Review"]
reviews_df.head()

Unnamed: 0_level_0,App,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,10 Best Foods for You,Positive,1.0,0.533333
1,10 Best Foods for You,Positive,0.25,0.288462
2,10 Best Foods for You,Positive,0.4,0.875
3,10 Best Foods for You,Positive,1.0,0.3
4,10 Best Foods for You,Positive,1.0,0.3


### - Drop Duplicates

In [9]:
play_store_df.drop_duplicates("App", inplace=True)
play_store_df

Unnamed: 0_level_0,App,Rating,Reviews,Installs,Price,Content Rating,Genres
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,Photo Editor & Candy Camera & Grid & ScrapBook,4.1,159,"10,000+",0,Everyone,Art & Design
1,Coloring book moana,3.9,967,"500,000+",0,Everyone,Art & Design;Pretend Play
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",4.7,87510,"5,000,000+",0,Everyone,Art & Design
3,Sketch - Draw & Paint,4.5,215644,"50,000,000+",0,Teen,Art & Design
4,Pixel Draw - Number Art Coloring Book,4.3,967,"100,000+",0,Everyone,Art & Design;Creativity
...,...,...,...,...,...,...,...
9355,FR Calculator,4.0,7,500+,0,Everyone,Education
9356,Sya9a Maroc - FR,4.5,38,"5,000+",0,Everyone,Education
9357,Fr. Mike Schmitz Audio Teachings,5.0,4,100+,0,Everyone,Education
9358,The SCP Foundation DB fr nn5n,4.5,114,"1,000+",0,Mature 17+,Books & Reference


### - Rename columns to match the SQL database tables

In [10]:
# Rename the columns for reviews df
reviews_df = reviews_df.rename(columns={"App": "app", "Sentiment": "sentiment", "Sentiment_Polarity": "sentiment_polarity", 
                                        "Sentiment_Subjectivity": "sentiment_subjectivity"})
reviews_df.head()

Unnamed: 0_level_0,app,sentiment,sentiment_polarity,sentiment_subjectivity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,10 Best Foods for You,Positive,1.0,0.533333
1,10 Best Foods for You,Positive,0.25,0.288462
2,10 Best Foods for You,Positive,0.4,0.875
3,10 Best Foods for You,Positive,1.0,0.3
4,10 Best Foods for You,Positive,1.0,0.3


In [30]:
reviews_df

Unnamed: 0_level_0,app,sentiment,sentiment_polarity,sentiment_subjectivity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,10 Best Foods for You,Positive,1.000000,0.533333
1,10 Best Foods for You,Positive,0.250000,0.288462
2,10 Best Foods for You,Positive,0.400000,0.875000
3,10 Best Foods for You,Positive,1.000000,0.300000
4,10 Best Foods for You,Positive,1.000000,0.300000
...,...,...,...,...
37422,Housing-Real Estate & Property,Positive,0.173333,0.486667
37423,Housing-Real Estate & Property,Positive,0.225000,0.447222
37424,Housing-Real Estate & Property,Negative,-0.287500,0.250000
37425,Housing-Real Estate & Property,Positive,0.800000,1.000000


In [12]:
# Rename the columns for play store df
play_store_df = play_store_df.rename(columns={"App": "app", "Rating": "rating", "Reviews": "reviews", 
                                              "Installs": "installs", "Price": "price", 
                                              "Content Rating": "content_rating", "Genres": "genres"})
play_store_df.head()

Unnamed: 0_level_0,app,rating,reviews,installs,price,content_rating,genres
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,Photo Editor & Candy Camera & Grid & ScrapBook,4.1,159,"10,000+",0,Everyone,Art & Design
1,Coloring book moana,3.9,967,"500,000+",0,Everyone,Art & Design;Pretend Play
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",4.7,87510,"5,000,000+",0,Everyone,Art & Design
3,Sketch - Draw & Paint,4.5,215644,"50,000,000+",0,Teen,Art & Design
4,Pixel Draw - Number Art Coloring Book,4.3,967,"100,000+",0,Everyone,Art & Design;Creativity


### - Re-format the Installs column to remove the "+" and the ","

In [13]:
# Remove the plus sign on Installs
play_store_df["installs"] = play_store_df["installs"].str.replace('+', '')
play_store_df["installs"] = play_store_df["installs"].str.replace(',', '').astype(int)
play_store_df.head()

Unnamed: 0_level_0,app,rating,reviews,installs,price,content_rating,genres
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,Photo Editor & Candy Camera & Grid & ScrapBook,4.1,159,10000,0,Everyone,Art & Design
1,Coloring book moana,3.9,967,500000,0,Everyone,Art & Design;Pretend Play
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",4.7,87510,5000000,0,Everyone,Art & Design
3,Sketch - Draw & Paint,4.5,215644,50000000,0,Teen,Art & Design
4,Pixel Draw - Number Art Coloring Book,4.3,967,100000,0,Everyone,Art & Design;Creativity


### - Change the data types to reflect the data types used in the SQL database

In [14]:
# Change the data type of reviews to int
play_store_df["reviews"] = play_store_df["reviews"].astype(int)

# Remove the $ and change the data type to float
play_store_df["price"] = play_store_df["price"].str.replace('$', '').astype(float)

# Load:

In [None]:
# Export csvs
play_store_df.to_csv("Output/play_store.csv")
reviews_df.to_csv("Output/reviews.csv")

## Connect to SQL Database

In [47]:
# Connect to the sql database
connection_string = "postgres:postgres@localhost:5432/google_play_store_db"
engine = create_engine(f'postgresql://{connection_string}')

In [48]:
# Get the table names
Inspector = inspect(engine)
Inspector.get_table_names()

['apps', 'reviews']

## Table Schemas:
### apps:
- id (PK)
- app
- rating
- reviews
- installs
- price 
- content_rating
- genres

### reviews:
- id (PK)
- app
- sentiment
- sentiment_polarity
- sentiment_subjectivity
- app_id (FK - apps (id) )

In [None]:
# Drop both tables: Reset
engine.execute("DROP TABLE reviews")
engine.execute("DROP TABLE apps")

## Load the cleaned dataframes to the SQL Database

In [49]:
# Append the dataframes to sql database
play_store_df.to_sql(name='apps', con=engine, if_exists='append', index=True)
reviews_df.to_sql(name='reviews', con=engine, if_exists='append', index=True)

## Check the data was loaded correctly by querying the database

In [50]:
# Query the database
engine.execute(
    """SELECT *
    FROM apps
    LIMIT 10""").fetchall()

[(0, 'Photo Editor & Candy Camera & Grid & ScrapBook', Decimal('4.1'), 159, 10000, '0.0', 'Everyone', 'Art & Design'),
 (1, 'Coloring book moana', Decimal('3.9'), 967, 500000, '0.0', 'Everyone', 'Art & Design;Pretend Play'),
 (2, 'U Launcher Lite – FREE Live Cool Themes, Hide Apps', Decimal('4.7'), 87510, 5000000, '0.0', 'Everyone', 'Art & Design'),
 (3, 'Sketch - Draw & Paint', Decimal('4.5'), 215644, 50000000, '0.0', 'Teen', 'Art & Design'),
 (4, 'Pixel Draw - Number Art Coloring Book', Decimal('4.3'), 967, 100000, '0.0', 'Everyone', 'Art & Design;Creativity'),
 (5, 'Paper flowers instructions', Decimal('4.4'), 167, 50000, '0.0', 'Everyone', 'Art & Design'),
 (6, 'Smoke Effect Photo Maker - Smoke Editor', Decimal('3.8'), 178, 50000, '0.0', 'Everyone', 'Art & Design'),
 (7, 'Infinite Painter', Decimal('4.1'), 36815, 1000000, '0.0', 'Everyone', 'Art & Design'),
 (8, 'Garden Coloring Book', Decimal('4.4'), 13791, 1000000, '0.0', 'Everyone', 'Art & Design'),
 (9, 'Kids Paint Free - Drawi

In [51]:
# Query the database
engine.execute(
    """SELECT *
    FROM reviews
    LIMIT 10""").fetchall()

[(0, '10 Best Foods for You', 'Positive', Decimal('1.0'), Decimal('0.533333333'), None),
 (1, '10 Best Foods for You', 'Positive', Decimal('0.25'), Decimal('0.288461538'), None),
 (2, '10 Best Foods for You', 'Positive', Decimal('0.4'), Decimal('0.875'), None),
 (3, '10 Best Foods for You', 'Positive', Decimal('1.0'), Decimal('0.3'), None),
 (4, '10 Best Foods for You', 'Positive', Decimal('1.0'), Decimal('0.3'), None),
 (5, '10 Best Foods for You', 'Positive', Decimal('0.6'), Decimal('0.9'), None),
 (6, '10 Best Foods for You', 'Neutral', Decimal('0.0'), Decimal('0.0'), None),
 (7, '10 Best Foods for You', 'Neutral', Decimal('0.0'), Decimal('0.0'), None),
 (8, '10 Best Foods for You', 'Positive', Decimal('0.7'), Decimal('0.6'), None),
 (9, '10 Best Foods for You', 'Positive', Decimal('0.2'), Decimal('0.1'), None)]

In [52]:
# Join the two tables to get results
engine.execute(
    """SELECT a.app, r.sentiment, a.reviews
    FROM apps as a
    INNER JOIN reviews as r on
    a.app = r.app
    ORDER BY a.reviews DESC
    LIMIT 10""").fetchall()

[('Facebook', 'Positive', 78158306),
 ('Facebook', 'Negative', 78158306),
 ('Facebook', 'Negative', 78158306),
 ('Facebook', 'Negative', 78158306),
 ('Facebook', 'Negative', 78158306),
 ('Facebook', 'Positive', 78158306),
 ('Facebook', 'Neutral', 78158306),
 ('Facebook', 'Positive', 78158306),
 ('Facebook', 'Neutral', 78158306),
 ('Facebook', 'Negative', 78158306)]

In [53]:
# Select app name and id to find ids to make foreign ids in reviews table
foreign_ids = engine.execute(
    """SELECT r.app, a.id
    from apps as a
    inner join reviews as r on
    a.app = r.app;""").fetchall()

In [68]:
# Create a foreign id for reviews_df
foreign_id_df = pd.DataFrame(foreign_ids)
foreign_id_df = foreign_id_df.rename(columns={0: "app", 1: "app_id"})
foreign_id_df.to_csv("../../foreign_id.csv")
foreign_id_df

Unnamed: 0,app,app_id
0,10 Best Foods for You,1327
1,10 Best Foods for You,1327
2,10 Best Foods for You,1327
3,10 Best Foods for You,1327
4,10 Best Foods for You,1327
...,...,...
35924,Housing-Real Estate & Property,1554
35925,Housing-Real Estate & Property,1554
35926,Housing-Real Estate & Property,1554
35927,Housing-Real Estate & Property,1554


In [72]:
# Reset index to merge properly
reviews_df.reset_index(inplace=True)
reviews_df

Unnamed: 0,id,app,sentiment,sentiment_polarity,sentiment_subjectivity
0,0,10 Best Foods for You,Positive,1.000000,0.533333
1,1,10 Best Foods for You,Positive,0.250000,0.288462
2,2,10 Best Foods for You,Positive,0.400000,0.875000
3,3,10 Best Foods for You,Positive,1.000000,0.300000
4,4,10 Best Foods for You,Positive,1.000000,0.300000
...,...,...,...,...,...
37422,37422,Housing-Real Estate & Property,Positive,0.173333,0.486667
37423,37423,Housing-Real Estate & Property,Positive,0.225000,0.447222
37424,37424,Housing-Real Estate & Property,Negative,-0.287500,0.250000
37425,37425,Housing-Real Estate & Property,Positive,0.800000,1.000000


In [79]:
# Merge tables on app name to add foreign key
merged_reviews_df = pd.merge(reviews_df, foreign_id_df, how="outer", on="app", left_index=True, right_index=True)

# Drop N/A values, change app_id to int, set index back to id
merged_reviews_df.dropna(how="any",inplace=True)
merged_reviews_df["app_id"] = merged_reviews_df["app_id"].astype(int)
merged_reviews_df.set_index("id", inplace=True)
merged_reviews_df

Unnamed: 0_level_0,app,sentiment,sentiment_polarity,sentiment_subjectivity,app_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,10 Best Foods for You,Positive,1.000000,0.533333,1327
1,10 Best Foods for You,Positive,0.250000,0.288462,1327
2,10 Best Foods for You,Positive,0.400000,0.875000,1327
3,10 Best Foods for You,Positive,1.000000,0.300000,1327
4,10 Best Foods for You,Positive,1.000000,0.300000,1327
...,...,...,...,...,...
35924,High-Powered Flashlight,Positive,0.104167,0.400000,1554
35925,High-Powered Flashlight,Positive,0.682969,0.600000,1554
35926,High-Powered Flashlight,Positive,0.600000,0.575000,1554
35927,High-Powered Flashlight,Positive,0.400000,0.700000,1554


In [80]:
# Drop the reviews table: reset
engine.execute("DROP TABLE reviews")
merged_reviews_df.to_sql(name='reviews', con=engine, if_exists='append', index=True)

# Query the database again
engine.execute(
    """SELECT *
    FROM reviews
    LIMIT 10""").fetchall()

[(0, '10 Best Foods for You', 'Positive', 1.0, 0.533333333, 1327),
 (1, '10 Best Foods for You', 'Positive', 0.25, 0.288461538, 1327),
 (2, '10 Best Foods for You', 'Positive', 0.4, 0.875, 1327),
 (3, '10 Best Foods for You', 'Positive', 1.0, 0.3, 1327),
 (4, '10 Best Foods for You', 'Positive', 1.0, 0.3, 1327),
 (5, '10 Best Foods for You', 'Positive', 0.6, 0.9, 1327),
 (6, '10 Best Foods for You', 'Neutral', 0.0, 0.0, 1327),
 (7, '10 Best Foods for You', 'Neutral', 0.0, 0.0, 1327),
 (8, '10 Best Foods for You', 'Positive', 0.7, 0.6, 1327),
 (9, '10 Best Foods for You', 'Positive', 0.2, 0.1, 1327)]

In [81]:
# Join the two tables to get results
engine.execute(
    """SELECT a.app, r.sentiment, a.reviews
    FROM apps as a
    INNER JOIN reviews as r on
    a.id = r.app_id
    ORDER BY a.reviews DESC
    LIMIT 10""").fetchall()

[('Facebook', 'Negative', 78158306),
 ('Facebook', 'Positive', 78158306),
 ('Facebook', 'Positive', 78158306),
 ('Facebook', 'Positive', 78158306),
 ('Facebook', 'Positive', 78158306),
 ('Facebook', 'Positive', 78158306),
 ('Facebook', 'Positive', 78158306),
 ('Facebook', 'Positive', 78158306),
 ('Facebook', 'Positive', 78158306),
 ('Facebook', 'Negative', 78158306)]