# Final Review

In [None]:
import numpy as np
import pandas as pd

## Data frames: Masking

In [None]:
# Load the sample flight data used by the masking, aggregate and group-by sections.
flights = pd.read_csv('./data/flight_sample.csv')
# Preview the first rows to see which columns are available.
flights.head()

In [None]:
# Comparing a column to a value yields a boolean Series (the mask), one entry per row.
print(flights['AIRLINE'] == 'AA')
# Indexing the frame with that mask keeps only the rows where the mask is True.
aa_flights = flights[ flights['AIRLINE'] == 'AA' ]
aa_flights

In [None]:
# Count the AA rows by summing the boolean mask (True counts as 1, False as 0).
flight_count2 = np.sum(flights['AIRLINE'] == 'AA')

# Two equivalent spellings of the same count: NumPy function vs. Series method.
print(np.sum(flights['AIRLINE'] == 'AA'))
print((flights['AIRLINE'] == 'AA').sum())
print( "there are {} AA flights in the set".format(flight_count2))
# why do we use sum() ^^
# shape is (rows, columns) of the filtered frame; shape[0] is its row count.
print(aa_flights.shape)
print(aa_flights.shape[0])
aa_flights.head()

In [None]:
# Combine two masks with & (element-wise AND); each comparison needs its own
# parentheses because & binds more tightly than == and >.
aa_flights_long = flights[ (flights['AIRLINE'] == 'AA') & (flights['DISTANCE'] > 1000) ]
# what if you wanted to filter on flights less than 1000 or greater than 2000?
aa_flights_long.head()

## Aggregate Functions

In [None]:
# what is broadcasting and how does it explain this result?
# mean() is applied column by column and returns one value per column.
# numeric_only=True restricts it to numeric columns: the frame also holds string
# columns (e.g. AIRLINE), and pandas >= 2.0 raises TypeError when asked to average them.
aa_flights.mean(numeric_only=True)

In [None]:
# Maximum of a single column: returns one scalar.
print(aa_flights['TAXI_OUT'].max())

print("%%%%%%%%%%%%%%%")
# Select two columns first, then aggregate: returns a Series with one max per column.
print(aa_flights[['TAXI_OUT', 'TAXI_IN']].max())

# how is this different than above? Why is this not great?
# Here max() runs over every column of the frame and only afterwards are the two
# columns of interest picked out of the result.
print("%%%%%%%%%%%%%%%")
print(aa_flights.max()[['TAXI_OUT', 'TAXI_IN']])

print("%%%%%%%%%%%%%%%")
# Max of every column in the frame.
print(aa_flights.max())

In [None]:
# Get unique list of Airlines
print(flights['AIRLINE'].unique())

# get count of unique airlines
# Two ways to count them. NOTE(review): nunique() skips missing values by default while
# len(unique()) counts NaN as a value, so the two differ if AIRLINE has gaps.
print(len(flights['AIRLINE'].unique()))
print(flights['AIRLINE'].nunique())

## aggregate()

What was the `aggregate()` function, and how was it more flexible than just calling `mean()` or `median()`? Wasn't there something about renaming columns as well?

## Dataframes: Indexes

In [None]:
# Load the college scorecard data; encoding tells read_csv how to decode the file's bytes.
scores = pd.read_csv('./data/college-scorecard-data-scrubbed.csv', encoding='latin-1')
# what if you want to set the index as you load the data frame ^^
# (a later cell in this notebook passes index_col="institution_name" to read_csv)
# what does 'encoding' do?

# Replace the default integer index with the institution name, modifying scores in place.
scores.set_index('institution_name', inplace=True)
scores.head()

In [None]:
# Re-read the file so this cell can be re-run on its own: set_index(inplace=True)
# would fail on a frame whose 'institution_name' column has already become the index.
scores = pd.read_csv('./data/college-scorecard-data-scrubbed.csv', encoding='latin-1')
scores.set_index('institution_name', inplace=True)

# use loc[] to access row (Panda Series) by index
scores.loc['University of Notre Dame']

In [None]:
# drill down even further
# First select the row by its index label, then pick one field out of that row.
scores.loc['University of Notre Dame']['url']

In [None]:
# if you are returning to this specific row, you can store it for easy access
nd = scores.loc['University of Notre Dame']
# Fields of the stored row are read by column name.
print("Notre Dame's top sat reading and math percentile is {} and {} respectively...".format(
        nd['sat_reading_75'], 
        nd['sat_math_75'])
     )

In [None]:
# You can slice via index with both numerical values and strings (also dates)
# what is different between implicit and explicit indexes when slicing?
# Label-based slices with loc[] include the end label, unlike positional slices.
# NOTE(review): the rows returned follow the frame's row order between the two labels.
scores.loc['University of Chicago':'University of Notre Dame']

In [None]:
# Reload with the default integer index so both key columns are still regular columns.
scores = pd.read_csv('./data/college-scorecard-data-scrubbed.csv', encoding='latin-1')
# Passing a list of columns builds a two-level (state, institution_name) MultiIndex;
# without inplace=True a new frame is returned and scores is left unchanged.
scores_multi_indexed = scores.set_index(['state', 'institution_name'])
scores_multi_indexed
# Examples of drilling into the levels (uncomment to try):
# scores_multi_indexed.loc['IN']
# scores_multi_indexed.loc['MI']
# scores_multi_indexed.loc['IN'].loc['University of Notre Dame']

## Dropping and Filling Missing Data

In [None]:
# Reload the scorecard data, this time setting the index while reading via index_col.
scores = pd.read_csv('./data/college-scorecard-data-scrubbed.csv', encoding='latin-1', index_col="institution_name")
# Dropping columns
print("Shape of entire scores DF", scores.shape)

# Examples of dropping one column or a list of columns (axis=1 selects columns).
# Left commented out so the full frame stays available to the cells below.
# scores.drop('white_percentage', inplace=True, axis=1)
# scores.drop(['black_percentage',
#        'hispanic_percent', 'asian_precent',
#        'american_indian_or_alaskan_native_precent',
#        'native_hawaiian_pacific_islander_percentage',
#        'two_or_more_races_percentage', 'non_resident_alients_percentage',
#        'unknown_percentage'], inplace=True, axis=1)

# print(scores.shape)
# scores.head()

In [None]:
# working with just a subset, this might be easier
# copy(deep=True) makes scores_small independent of scores, so later edits to it
# do not touch the original frame.
scores_small = scores[['city','state','url', 'predominant_degree_desc', 'sat_math_25', 'sat_reading_25', 'sat_math_75', 'sat_reading_75']].copy(deep=True)
print(scores_small.shape)

In [None]:
# see how many null values we have...
# isnull() gives a boolean frame; summing it counts the missing values per column.
scores_small.isnull().sum()

# Alternative walkthrough (uncomment to compare shapes before and after dropna):
# scores_small_clean = scores_small.dropna().copy()
# print(scores_small.shape)
# print(scores_small_clean.shape)
# scores_small_clean.isnull().sum()

In [None]:
# if you drop na on a dataset, it will drop rows that have ANY missing data
# Missing-value count per column, then the shape before and after dropna() on the full frame.
print("entire df", scores.isnull().sum())
print("\nentire df", scores.shape)
print("\ndrop all in entire df", scores.dropna().shape)

# Same comparison on the column subset; scores_clean keeps only rows with no missing
# value in any of the selected columns and is reused by the later SAT cells.
print("\nsmall df", scores_small.shape)
scores_clean = scores_small.dropna().copy(deep=True)
print("\ndropna on entire 'clean'' df", scores_clean.shape)

In [None]:
# Fill missing values instead of dropping rows, using a different strategy per column.
avg_sat_math = int(scores_small['sat_math_75'].mean())
print(avg_sat_math)
# Assign the filled column back rather than calling fillna(inplace=True) on a column
# selection: that is chained assignment and may leave scores_small unchanged when
# pandas Copy-on-Write is active.
scores_small['sat_math_75'] = scores_small['sat_math_75'].fillna(avg_sat_math)
scores_small['sat_reading_75'] = scores_small['sat_reading_75'].fillna(0)
# Fill the remaining numeric gaps with each column's mean. numeric_only=True is needed
# because the frame also holds text columns (city, state, url, ...), which mean()
# refuses to average on pandas >= 2.0; those columns keep their missing values.
scores_small = scores_small.fillna(scores_small.mean(numeric_only=True))

# fillna() never removes rows, so the shape is unchanged.
print("fillna() on entire small df", scores_small.shape)

In [None]:
# Show 5 randomly chosen rows; no seed is set, so the rows differ on every run.
scores_small.sample(5)

## Composing New Series/Columns 

In [None]:
# create new column that totals the reading and math
# Adding two columns works element-wise, row by row.
scores_clean['sat_total_75'] = scores_clean['sat_math_75'] + scores_clean['sat_reading_75']

# get the overall average
overall_sat_mean = scores_clean['sat_total_75'].mean()
print(overall_sat_mean)

# create a new column that diffs the total for each school
# The scalar mean is subtracted from every row of the column.
scores_clean['sat_diff_75'] = scores_clean['sat_total_75'] - overall_sat_mean
scores_clean.sample(5)


### idxmax() and idxmin()

In [None]:
# idxmax()/idxmin() return the index label (here the institution name) of the row
# holding the largest/smallest value, not the value itself.
highest_school = scores_clean['sat_diff_75'].idxmax()
lowest_school = scores_clean['sat_diff_75'].idxmin()

# Look the row up by label each time a field is needed...
print("{} has the highest score with {} ({})".format(highest_school, scores_clean.loc[highest_school]['sat_total_75'], scores_clean.loc[highest_school]['sat_diff_75']))

# ...or store the row once and read several fields from it.
low_school_data = scores_clean.loc[lowest_school]

print("{} has the lowest score with {} ({})".format(lowest_school, low_school_data['sat_total_75'], low_school_data['sat_diff_75']))

In [None]:
# Show location and both 75th-percentile SAT scores for the two extreme schools.
# The original listed 'sat_reading_75' twice, so the math score was never shown.
sat_summary_cols = ['city', 'url', 'sat_math_75', 'sat_reading_75']
# A single loc[row, columns] call replaces the chained loc[row][columns] lookup.
print(scores.loc[highest_school, sat_summary_cols])
print(scores.loc[lowest_school, sat_summary_cols])

## Using Apply() to compose new columns in a data frame

In [None]:
# create the column for summed sat scores
scores_clean['sat_total_25'] = scores_clean['sat_math_25'] + scores_clean['sat_reading_25']

# get overall average
# Module-level value read by calculate_diff(), defined just below in this cell.
overall_sat_mean_25 = scores_clean['sat_total_25'].mean()

def calculate_diff(total):
    """Return how far `total` lies from the overall 25th-percentile SAT mean.

    Reads the module-level `overall_sat_mean_25`, so that name must be defined
    before this function is called. The result is positive when `total` is
    above the mean and negative when it is below.
    """
    difference = total - overall_sat_mean_25
    return difference


In [None]:
# use function to calculate diff
# apply() calls calculate_diff once per value of the column and collects the results.
scores_clean['sat_diff_25'] = scores_clean['sat_total_25'].apply(calculate_diff)

# Index labels (institution names) of the rows with the largest / smallest diff.
highest_score = scores_clean['sat_diff_25'].idxmax()
lowest_score = scores_clean['sat_diff_25'].idxmin()

print("{} has the highest score with {} ({})".format(highest_score, scores_clean.loc[highest_score]['sat_total_25'], scores_clean.loc[highest_score]['sat_diff_25']))

low_school = scores_clean.loc[lowest_score]

print("{} has the lowest score with {} ({})".format(lowest_score, low_school['sat_total_25'], low_school['sat_diff_25']))
# NOTE(review): this expression is not the last statement of the cell, so the notebook
# does not render its result.
scores_clean.head()

# Rebinds nd to the row from scores_clean, which carries the computed total/diff columns.
nd = scores_clean.loc['University of Notre Dame']
print("by the way... Notre dames numbers are 75: {} | {} and 25: {} | A diff of {}".format(int(nd['sat_total_75']), int(nd['sat_diff_75']), int(nd['sat_total_25']), int(nd['sat_diff_25'])))


### Group By

In [None]:
# Reload the flight sample so the group-by section starts from the unmodified data.
flights = pd.read_csv('./data/flight_sample.csv')
flights.head()

In [None]:
# simple group by
# groupby() only records how rows are split; nothing is aggregated until a method
# such as mean() or max() is called on the result.
flights_by_airline = flights.groupby(['AIRLINE'])
# .groups maps each group key to the index labels of the rows in that group.
flights_by_airline.groups
# Things to try on the grouped object:
# flights_by_airline.head()
# flights_by_airline.mean()
# flights_by_airline.mean().loc['AA']

In [None]:
# group by on multiple columns
# The aggregated result is indexed by an (AIRLINE, MONTH) MultiIndex.
flights_by_airline_and_month = flights.groupby(['AIRLINE', 'MONTH'])
# Per-group maximum of every column; [:5] keeps the first five groups.
flights_by_airline_and_month.max()[:5]

# Ways to drill into the two index levels (uncomment to try):
# flights_by_airline_and_month.mean().loc['AA'].loc[3]
# flights_by_airline_and_month.mean().loc['AA',3]
# flights_by_airline_and_month.mean().loc['AA'].loc[3:9]['DISTANCE']
# flights_by_airline_and_month.mean().loc['AA'].loc[3:9]['DISTANCE'].idxmax()

In [None]:
# what is the longest flight AA flew in July 2015
# and how much longer (or shorter) than in December of that year

# Aggregate only the DISTANCE column, and only once: the original recomputed the
# per-group max of every column four times just to read this one value.
max_distance = flights_by_airline_and_month['DISTANCE'].max()
# The result is a Series keyed by (AIRLINE, MONTH); months are numbered, so 7 is July.
july_max = max_distance.loc[('AA', 7)]
december_max = max_distance.loc[('AA', 12)]

print("July: ", july_max)
print("December: ", december_max)
# Positive when the longest July flight is longer than the longest December flight.
print(july_max - december_max)

In [None]:
# How would you sort a dataframe by a specific column?

## Pivot Tables

In [None]:
# Remind ourselves of the flight columns before pivoting on them.
flights.head()

In [None]:
# One row per DAY_OF_WEEK, one column per AIRLINE; each cell aggregates DISTANCE for
# that pair (pivot_table uses the mean when no aggfunc is given).
flight_pvt = flights.pivot_table('DISTANCE',index='DAY_OF_WEEK', columns = 'AIRLINE')
# Variant with several aggregations at once (max and min):
# flight_pvt = flights.pivot_table('DISTANCE',index='DAY_OF_WEEK', columns = 'AIRLINE', aggfunc=[np.max, np.min])

flight_pvt


## Time Series

In [None]:
# Load the Google stock price history; at this point 'Date' is still a regular column.
goog = pd.read_csv("./data/Google_Stock_Price.csv")
goog.head()

In [None]:
# convert column to a date object
goog['Date'] = pd.to_datetime(goog['Date'])
# Confirm that 'Date' now has a datetime dtype.
print(goog.dtypes)

# Use the dates as the index so rows can be selected and resampled by date.
goog.set_index('Date', inplace=True)
goog.head()

In [None]:
# NOTE(review): only the last expression of a cell is rendered, so the first two
# selections below are evaluated but not shown.

# Get rows based on year
goog.loc['2007'][:3]

# Get rows based on month
goog.loc['March 2007'][:3]

# Slice frame by date
# The third slice component is a step: keep every 7th row within the date range.
goog.loc['2007-3-23': '2008-3-23':7]

In [None]:
# NOTE(review): pandas 2.2+ deprecates the frequency aliases 'A', 'BM' and 'Q' in
# favour of 'YE', 'BME' and 'QE'; confirm the target pandas version before switching.
# NOTE(review): only the last expression (goog_res) is rendered by the notebook.

# asfreq picks the existing value at each year-end date (no aggregation).
goog_end_year = goog.asfreq('A')
goog_end_year
# resample + mean averages all rows inside each business-month bucket.
goog_avg = goog.resample('BM').mean()
goog_avg
# Quarterly means, then restricted to the years 2010 through 2014.
goog_res = goog.resample('Q').mean().loc['2010': '2014']
goog_res

# what was that about rolling? 

## Line Plots


In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
%matplotlib inline

# what is seaborn? what is it used for?

In [None]:
# Load the 2018 roster, using the 'Number' column as the index.
roster = pd.read_csv('./data/nd-football-2018-roster.csv', index_col='Number')
roster.head()

In [None]:
# Line plot of player height against the roster index (the 'Number' column).
figure,axes = plt.subplots()
# what are the other ways to style the lines?
axes.plot(roster.index, roster['Height'], label="Height")

# what are the different ways you can style the chart and legend?
# Axis label spelling corrected ('Jersy' -> 'Jersey').
axes.set_xlabel('Jersey Number')
axes.set_ylabel('Height')
axes.set_title('Players by Height')
axes.legend()


# how do you limit the size of the x and y coordinates? 'xlim' something?

In [None]:
# Height and weight drawn as two lines on one shared set of axes.
figure,axes = plt.subplots(figsize=(12,6))
axes.plot(roster.index, roster['Height'], label="Height")
axes.plot(roster.index, roster['Weight'], label="Weight")

# Axis label spelling corrected ('Jersy' -> 'Jersey').
axes.set_xlabel('Jersey Number')
# Both series share this axis, so the label names both (it previously said only 'Height').
axes.set_ylabel('Height and Weight')
axes.set_title('Players by Height and Weight')
axes.legend()

## Multiple Subplots

In [None]:
# One row, two columns of subplots: axes is an array with one Axes object per panel.
figure,axes = plt.subplots(1,2, figsize=(12,6))

# Left panel: height. Axis label spelling corrected ('Jersy' -> 'Jersey').
axes[0].plot(roster.index, roster['Height'], label="Height")

axes[0].set_xlabel('Jersey Number')
axes[0].set_ylabel('Height')
axes[0].set_title('Players by Height')
axes[0].legend()

# Right panel: weight, drawn in green to tell the two panels apart.
axes[1].plot(roster.index, roster['Weight'], label="Weight", color='Green')

axes[1].set_xlabel('Jersey Number')
axes[1].set_ylabel('Weight')
axes[1].set_title('Players by Weight')
axes[1].legend()

# What is sharey=True and when/why would you use it

In [None]:
# Scatter Plots

figure,axes = plt.subplots(figsize=(8,4))
# x = Weight, y = Height; each point is also coloured by height through the colormap.
img = axes.scatter(
    roster['Weight'], roster['Height'],
    c=roster['Height'], cmap='coolwarm'
)

axes.set_title("Relationship between Weight and Height")
# Labels now match the plotted data (they were swapped: x was labelled 'Height', y 'Weight').
axes.set_xlabel('Weight')
axes.set_ylabel('Height')

# The colorbar explains the point colours (height).
figure.colorbar(img, label="the taller they get...")

## Histograms

In [None]:
figure, axes = plt.subplots()
# bins sets how many equal-width intervals the height range is divided into.
axes.hist(roster['Height'], bins=12)
# what does bins do? In this case is 12 a good number

In [None]:
# create a mask with the OR bitwise operator
# A row is kept when either comparison is True; [:5] shows the first five matches.
roster[( roster['Class'] == 'Sr.') | (roster['Class'] == 'Jr.')][:5]

In [None]:
figure, axes = plt.subplots()
# upper: seniors and juniors. under: every row that is neither, i.e. the negation of
# the upper mask (~ is element-wise NOT).
upper = roster[( roster['Class'] == 'Sr.') | (roster['Class'] == 'Jr.')]
under = roster[~( roster['Class'] == 'Sr.') & ~(roster['Class'] == 'Jr.')]

# Overlay both distributions; the second one is semi-transparent so the first stays visible.
axes.hist(upper['Weight'], label="Upper Classman", alpha=1, color="blue")
axes.hist(under['Weight'], label="Under Classman", alpha=.7, color="gold")

axes.legend()
axes.set_title("Weight of Upper vs Under Classman")
axes.set_ylabel("Number of Players")
axes.set_xlabel("Weight")

## 2D Histogram

In [None]:
# Drop every row that has a missing value in any column before plotting.
flights_clean = flights.dropna()
print(flights_clean.shape)

# Build the AA subset once instead of re-evaluating the same mask for each axis.
aa_clean = flights_clean[flights_clean['AIRLINE'] == 'AA']

figure, axes = plt.subplots()
# 2D histogram of distance vs. taxi-out time; cmin=1 leaves empty bins blank
# instead of colouring them as zero.
image = axes.hist2d(
    aa_clean['DISTANCE'],
    aa_clean['TAXI_OUT'],
    cmin=1,
    cmap='coolwarm'
)

# Label spelling corrected ('Distane' -> 'Distance').
axes.set_xlabel("Distance")
axes.set_ylabel("Taxi Out")

## Bar Plots

In [None]:
# Per (AIRLINE, MONTH) sums of the grouped frame's columns; preview the first groups.
flights_by_airline_and_month.sum().head()

In [None]:
# gonna be using this...
# Month labels in calendar order, used as x tick labels by the bar charts below.
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'June', 'July', 'Aug', 'Sept', 'Oct', 'Nov', 'Dec']

In [None]:
figure,axes = plt.subplots()
# Sum only DISTANCE per (AIRLINE, MONTH) and keep the AA rows: a Series indexed by MONTH.
# A new name is used on purpose: rebinding aa_flights (the row-filtered frame from the
# masking section) to this aggregate would break those earlier cells when re-run.
aa_monthly_distance = flights_by_airline_and_month['DISTANCE'].sum().loc['AA']
tick_positions = range(len(aa_monthly_distance))
axes.bar(tick_positions, aa_monthly_distance)

axes.set_xticks(tick_positions)
# Derive each label from the month number actually present, so labels stay correct
# (and equal in count to the ticks) even when AA has no flights in some month.
# Assumes MONTH holds 1-12 month numbers, as the .loc['AA', 7] July lookup earlier implies.
axes.set_xticklabels([months[int(m) - 1] for m in aa_monthly_distance.index], rotation=70)

axes.set_title('American Airlines Total Distance By Month')
axes.set_ylabel('Total Distance')
axes.set_xlabel('Months')

In [None]:

figure,axes = plt.subplots(figsize=(12,8))

# Width of each bar and the horizontal shift between neighbouring airlines.
offset = .3
# Compute the per-(AIRLINE, MONTH) distance totals once and reuse them for every airline.
monthly_distance = flights_by_airline_and_month['DISTANCE'].sum()
month_positions = np.arange(12)

# One bar series per airline, shifted left / centred / right around each month tick.
# NOTE(review): assumes every airline has rows for all 12 months, as the original did.
for shift, airline_code, airline_label in [
    (-offset, 'AA', 'American'),
    (0, 'WN', 'Southwest'),
    (offset, 'HA', 'Hawaiian'),
]:
    axes.bar(month_positions + shift, monthly_distance.loc[airline_code], width=offset, label=airline_label)

axes.set_xticks(month_positions)
axes.set_xticklabels(months, rotation=70)

# The title now names all three airlines (it was copied from the AA-only chart).
axes.set_title('Total Distance By Month: American vs Southwest vs Hawaiian')
axes.set_ylabel('Total Distance')
axes.set_xlabel('Months')

axes.legend()

## Back to Time

In [None]:
# Plot the business-month averages computed in the Time Series section (goog_avg).
fig,axes = plt.subplots()
axes.plot(goog_avg.index, goog_avg['High'], label='High')
axes.plot(goog_avg.index, goog_avg['Low'], label='Low')

axes.legend()

In [None]:
# Plot the quarterly averages for 2010-2014 (goog_res from the Time Series section).
fig,axes = plt.subplots()
axes.plot(goog_res.index, goog_res['High'], label='High')
axes.plot(goog_res.index, goog_res['Low'], label='Low')
# Put one tick on every quarter and rotate the date labels so they do not overlap.
axes.set_xticks(goog_res.index)
axes.set_xticklabels(goog_res.index, rotation=90)
axes.legend()