# Introduction to Exploratory Data Analysis
Using Pandas!

## Imports and Reading Data

In [None]:
# Third-party libraries used throughout the notebook.
import numpy as np
import pandas as pd
# Import pyplot directly: the pylab namespace is discouraged by matplotlib,
# and everything this notebook needs from `plt` lives in pyplot.
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide display settings: ggplot styling for figures, and allow up to
# 200 columns to be shown when a DataFrame is displayed.
plt.style.use('ggplot')
pd.set_option('display.max_columns', 200)

In [None]:
# Location of the dataset. A path relative to the working directory keeps the
# notebook runnable on any machine; change DATA_PATH if the CSV lives elsewhere.
DATA_PATH = "coaster_db.csv"

df = pd.read_csv(DATA_PATH)

## Data Understanding
- Dataframe `shape`
- `head` and `tail`
- `dtypes`
- `describe`


In [None]:
# (rows, columns) of the frame as loaded.
df.shape

In [None]:
# Column labels of the frame as loaded.
df.columns

In [None]:
# Peek at the first five rows (five is the default for head()).
df.head()

In [None]:
# Data type pandas assigned to each column on load.
df.dtypes

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
df.describe()

## Data Preparation
- Dropping irrelevant columns and rows
- Identifying duplicated columns
- Renaming Columns
- Feature Creation

In [None]:
# Example of dropping columns
# df.drop(['Opening date'], axis=1)

In [None]:
# List the column labels again before choosing which ones to keep.
df.columns

In [None]:
 
# Keep only the columns used in the rest of the analysis. The .copy() makes
# the result an independent frame rather than a view of the original.
df = df.loc[
    :,
    [
        'coaster_name', 'Speed', 'Location', 'year_introduced',
        'opening_date_clean', 'Status', 'Height', 'speed_mph',
        'height_ft', 'Inversions_clean', 'Gforce_clean',
    ],
].copy()
df

In [None]:
# Data types of the columns that were kept.
df.dtypes

In [None]:
# Parse the opening date column into datetime values and store it back
# under the same column name.
opening_dates = pd.to_datetime(df['opening_date_clean'])
df['opening_date_clean'] = opening_dates

In [None]:
# Preview the frame with tidied, consistently capitalised column names.
# NOTE: the result of rename() is only displayed here, it is not assigned back,
# so `df` itself keeps its original column names after this cell.
df.rename(columns={
    'coaster_name': 'Coaster_Name',  # was misspelled 'Coster_Name'
    'year_introduced': 'Year_Introduced',
    'opening_date_clean': 'Opening_Date_Clean',
    'speed_mph': 'Speed_mph',
    'height_ft': 'Height_ft',
    'Inversions_clean': 'Inversions_Clean',
    'Gforce_clean': 'Gforce_Clean',
})

In [None]:
# Count the missing values in every column.
missing_per_column = df.isna().sum()
missing_per_column

In [None]:
# Show rows that are exact repeats of an earlier row.
is_repeat = df.duplicated()
df.loc[is_repeat]

In [None]:
# Check for duplicate coaster names: rows whose coaster_name has already
# appeared in an earlier row, even if other columns differ.
df.loc[df.duplicated(subset=['coaster_name'])]

In [None]:
# Discard fully duplicated rows (the first occurrence is kept) and rebuild a
# clean 0..n-1 index.
df = df.drop_duplicates().reset_index(drop=True)

In [None]:
# Feature creation: expose the introduction year under the shorter name 'year'.
df['year'] = df['year_introduced'].copy()

## Feature Understanding


- Plotting Feature Distributions
    - Histogram
    - KDE
    - Boxplot

In [None]:
# How many coasters were introduced in each year, most frequent year first.
coasters_per_year = df['year'].value_counts()
coasters_per_year

In [None]:
# Bar chart of the ten years with the most coaster introductions.
top_years = df['year'].value_counts().head(10)
ax = top_years.plot(
    kind='bar',
    title='Top 10 Years Coasters Introduced',
)
ax.set_xlabel('Year Introduced')
ax.set_ylabel('Count')

In [None]:
# Distribution of coaster speeds as a 20-bin histogram.
speeds = df['speed_mph']
ax = speeds.plot(kind='hist', bins=20, title='Coaster Speed (mph)')
ax.set_xlabel('Speed (mph)')

## Feature Relationships
- Scatterplot
- Heatmap Correlation
- Pairplot
- Groupby comparisons

In [None]:
# Speed against height, one point per coaster.
# `df` still carries its original lowercase column names (the earlier rename
# was never assigned back), so those are the names that must be used here.
df.plot(kind='scatter',
        x='speed_mph',
        y='height_ft',
        title='Coaster Speed vs. Height')
plt.show()

In [None]:
# Same speed/height scatter, coloured by the year the coaster was introduced.
# Uses the original lowercase column names, which are the ones present in `df`.
ax = sns.scatterplot(x='speed_mph',
                     y='height_ft',
                     hue='year_introduced',
                     data=df)
ax.set_title('Coaster Speed vs. Height')
plt.show()

In [None]:
# Pairwise relationships between the measurement columns.
# Only columns expected to be numeric are plotted (assumed from their names;
# confirm against df.dtypes); name, location and date columns are left out.
# No `hue` is passed: 'Type_Main' is not among the columns kept in `df`.
sns.pairplot(df,
             vars=['year_introduced', 'speed_mph', 'height_ft',
                   'Inversions_clean', 'Gforce_clean'])
plt.show()

In [None]:
# Correlation matrix of the measurement columns, computed on rows where all
# of them are present. Text and datetime columns are excluded because a
# correlation is only meaningful for numeric data.
# Assumes the selected columns are numeric; confirm against df.dtypes.
df_corr = df[['year_introduced', 'speed_mph', 'height_ft',
              'Inversions_clean', 'Gforce_clean']].dropna().corr()
df_corr

## Ask a Question about the data
- Try to answer a question you have about the data using a plot or statistic.

In [None]:
# Question: which locations have the fastest coasters on average?
# Locations labelled "Other" are excluded, and only locations with at least
# ten recorded speed values are kept so tiny groups do not skew the ranking.
# The speed column is 'speed_mph': `df` still has its original column names.
ax = (
    df.query('Location != "Other"')
    .groupby('Location')['speed_mph']
    .agg(['mean', 'count'])
    .query('count >= 10')
    .sort_values('mean')['mean']
    .plot(kind='barh', figsize=(12, 5), title='Average Coaster Speed by Location')
)
ax.set_xlabel('Average Coaster Speed')
plt.show()