# Imports, Options, and Getting Data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling
from dataprep.eda import plot, plot_correlation, plot_missing
from sqlalchemy import create_engine

In [None]:
pd.set_option("display.max_columns", 100)
pd.set_option('display.width', 100)
pd.set_option("display.precision", 5)
plt.style.use('fivethirtyeight')
plt.rcParams.update({'font.size': 4, 'font.family': 'sans'})
%matplotlib

In [None]:
def load_dataframe_from_yelp_2(query, connection_url=None):
    """
    Connects to yelp_2 database on Postgres and
    loads a Pandas dataframe based off sql query.

    Args:
        query (string): Sql query to select data from yelp_2.
        connection_url (string, optional): SQLAlchemy database URL to
                    connect with. When omitted, the local yelp_2
                    database on localhost:5432 is used.

    Returns:
        Dataframe: Pandas dataframe of records
                    from sql query of yelp_2 database.
    """
    if connection_url is None:
        # NOTE(review): credentials are hard-coded for the local database;
        # pass connection_url to keep real credentials out of the notebook.
        connection_url = 'postgresql+psycopg2://postgres:password@localhost:5432/yelp_2'
    engine = create_engine(connection_url)
    try:
        # read_sql already returns a fresh dataframe, so no extra copy is made.
        return pd.read_sql(query, con=engine)
    finally:
        # Release the engine's pooled connections; a new engine is created on
        # every call, so leaving them open would leak one pool per query.
        engine.dispose()

In [None]:
# Pull up to 10,000 rows (all columns) from the test4 table into `df`.
# NOTE(review): the query has no ORDER BY, so which rows are returned is
# left to the database — confirm that an arbitrary sample is acceptable.
query = '''
        SELECT *
        FROM test4
        LIMIT 10000
        ;
        '''
df = load_dataframe_from_yelp_2(query)

In [None]:
# Keep one row per review_id, cap the frame at its first 10,000 rows,
# then print the column summary.
df = df.drop_duplicates(subset='review_id').head(10000)
df.info()

In [None]:
# Timestamp used as the dataset's release date.
dataset_release_date = pd.to_datetime(arg='2020-3-25 19:13:01')

## Preparing the Data for EDA

In [None]:
# Turn the comma-separated category string into a list of category names.
# Only strings are split: missing values (None/NaN) pass through unchanged,
# and re-running the cell leaves already-split lists as they are instead of
# raising AttributeError.
df['business_categories'] = df['business_categories'].apply(
    lambda x: x.split(', ') if isinstance(x, str) else x
)

## Splitting Dataframe on T2_CLS_ufc_>0

In [None]:
# Keep an independent copy of the full frame, then split the rows by the
# T2_CLS_ufc_>0 flag (rows where the flag is neither True nor False land in
# neither subset).
full_df = df.copy()
useful_reviews = df.loc[df['T2_CLS_ufc_>0'].eq(True)]
non_useful_reviews = df.loc[df['T2_CLS_ufc_>0'].eq(False)]

# EDA

## Basic Pandas Tools and Dataprep.eda Column Exploration

In [None]:
# Print the column summary (dtypes, non-null counts) for the non-useful subset.
non_useful_reviews.info()

In [None]:
# dataprep.eda single-column view of user_review_count_TD.
plot(df, 'user_review_count_TD')

In [None]:
# Summary statistics for df, shown as the cell output.
df.describe()

## Pandas Profiling

In [None]:
# Build the pandas-profiling report in minimal mode; the bare `profile`
# expression on the last line renders it as the cell output.
profile = df.profile_report(minimal=True)
profile

## Dataprep.eda

- `plot(df)`: "I want an overview of the dataset"
- `plot(df, "col_1")`: "I want to understand the column col_1"
- `plot(df, "col_1", "col_2")`: "I want to understand the relationship between columns col_1 and col_2"

In [None]:
# dataprep.eda overview of the whole dataframe.
plot(df)

## Correlation Matrix

In [None]:
# Columns removed by name before computing correlations.
non_numeric_columns = ['review_id', 'T2_CLS_ufc_>0',
                       'T3_CLS_ufc_level', 'T5_CLS_ufc_level_TD']
# Select numeric/bool dtypes explicitly instead of relying on corr() to drop
# the rest: columns such as business_categories (lists of strings) are still
# present after the drop, and pandas >= 2.0 raises on them rather than
# silently skipping them.
numeric_columns = (
    df.drop(labels=non_numeric_columns, axis=1)
      .select_dtypes(include=['number', 'bool'], exclude=['timedelta'])
)
# Express correlations as whole-number percentages (-100..100).
correlation_matrix = (numeric_columns.corr() * 100).round()

# Generate a mask for the upper triangle (diagonal included) so each pair
# of columns is drawn only once.
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))

# Set up the matplotlib figure
fig, ax = plt.subplots(figsize=(10, 8))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap with the mask. fmt='.0f' prints the rounded percentages as
# plain integers; the default '.2g' would render +/-100 as '1e+02'.
sns.heatmap(correlation_matrix, mask=mask, cmap=cmap, robust=True, annot=True, fmt='.0f',
            center=0, square=False, linewidths=.5, cbar=True, cbar_kws={"shrink": .5})
fig.tight_layout()
plt.show()