# Chapter 3: Beginning Data Analysis

## Recipes
* [Developing a data analysis routine](#Developing-a-data-analysis-routine)
* [Reducing memory by changing data types](#Reducing-memory-by-changing-data-types)
* [Selecting the smallest of the largest](#Selecting-the-smallest-of-the-largest)
* [Selecting the largest of each group by sorting](#Selecting-the-largest-of-each-group-by-sorting)
* [Replicating nlargest with sort_values](#Replicating-nlargest-with-sort_values)
* [Calculating a trailing stop order price](#Calculating-a-trailing-stop-order-price)

In [None]:
import pandas as pd
import numpy as np
from IPython.display import display
# Let pandas show up to 50 columns before truncating wide DataFrames.
pd.options.display.max_columns = 50

# Developing a data analysis routine

In [None]:
# Load the college dataset from a CSV path relative to the working directory.
college = pd.read_csv('data/college.csv')

In [None]:
# Preview the first rows of the dataset.
college.head()

In [None]:
# (number of rows, number of columns).
college.shape

In [None]:
# Summary statistics for the numeric columns, transposed so each dataset
# column becomes a row; at most 8 rows of the result are displayed.
numeric_summary = college.describe(include=[np.number]).T
with pd.option_context('display.max_rows', 8):
    display(numeric_summary)

In [None]:
# Summarize the object (string) and categorical columns.  The builtin
# ``object`` replaces the ``np.object`` alias, which NumPy deprecated in 1.20
# and removed in 1.24, and 'category' is the dtype selector for categorical
# columns (``pd.Categorical`` is an array class, not a dtype).
college.describe(include=[object, 'category']).T

In [None]:
# Print a concise per-column overview of the DataFrame.
college.info()

In [None]:
# Summary statistics for the numeric columns, one row per column.
college.describe(include=[np.number]).T

In [None]:
# Summarize the object (string) and categorical columns.  The builtin
# ``object`` replaces the ``np.object`` alias, which NumPy deprecated in 1.20
# and removed in 1.24, and 'category' is the dtype selector for categorical
# columns (``pd.Categorical`` is an array class, not a dtype).
college.describe(include=[object, 'category']).T

## There's more...

In [None]:
# Numeric summary with an explicit, finer-grained list of percentiles;
# at most 5 rows of the transposed result are displayed.
extra_percentiles = [.01, .05, .10, .25, .5, .75, .9, .95, .99]
detailed_summary = college.describe(include=[np.number],
                                    percentiles=extra_percentiles).T
with pd.option_context('display.max_rows', 5):
    display(detailed_summary)

In [None]:
# Load the data dictionary that accompanies the college dataset.
college_dd = pd.read_csv('data/college_data_dictionary.csv')

In [None]:
# Show the data dictionary, limiting the display to 8 rows.
with pd.option_context('display.max_rows', 8):
    display(college_dd)

# Reducing memory by changing data types

In [None]:
# Reload the data and pull out five columns to experiment with.
college = pd.read_csv('data/college.csv')
different_cols = ['RELAFFIL', 'SATMTMID', 'CURROPER', 'INSTNM', 'STABBR']
# All rows, only the columns listed above.
col2 = college.loc[:, different_cols]
col2.head()

In [None]:
# Data type of each selected column.
col2.dtypes

In [None]:
# Per-column memory footprint in bytes; deep=True also measures the contents
# of object columns instead of only their references.
original_mem = col2.memory_usage(deep=True)
original_mem

In [None]:
# Downcast RELAFFIL to 8-bit integers.
# NOTE(review): assumes the column has no missing values and fits in int8 — confirm.
col2['RELAFFIL'] = col2['RELAFFIL'].astype(np.int8)

In [None]:
# Check the dtypes again after the conversion.
col2.dtypes

In [None]:
# Number of distinct values in each object column.
col2.select_dtypes(include=['object']).nunique()

In [None]:
# Store STABBR as a categorical, then re-check the dtypes.
col2['STABBR'] = col2['STABBR'].astype('category')
col2.dtypes

In [None]:
# Per-column memory footprint after the dtype conversions.
new_mem = col2.memory_usage(deep=True)
new_mem

In [None]:
# Ratio of new to original memory per column; values below 1 are savings.
new_mem / original_mem

## There's more...

In [None]:
# Start again from a freshly loaded copy of the data.
college = pd.read_csv('data/college.csv')

In [None]:
# Baseline memory usage of CURROPER and INSTNM before modifying them.
college[['CURROPER', 'INSTNM']].memory_usage(deep=True)

In [None]:
# Overwrite one CURROPER value with 10000000 and append a single character
# to one INSTNM string, then re-measure to see how each column's memory changes.
college.loc[0, 'CURROPER'] = 10000000
college.loc[0, 'INSTNM'] = college.loc[0, 'INSTNM'] + 'a'
# Optional: uncomment to lengthen a second string as well.
# college.loc[1, 'INSTNM'] = college.loc[1, 'INSTNM'] + 'a'
college[['CURROPER', 'INSTNM']].memory_usage(deep=True)

In [None]:
# Current dtype of the MENONLY column.
college['MENONLY'].dtype

In [None]:
# Expected to fail: this cell demonstrates the error raised when the column
# holds missing values that an integer dtype cannot represent.
college['MENONLY'].astype('int8') # ValueError: Cannot convert non-finite values (NA or inf) to integer

In [None]:
# Select the 64-bit integer and float columns by dtype name strings.
college.describe(include=['int64', 'float64']).T

In [None]:
# Same selection, spelled with the NumPy dtype objects instead of strings.
college.describe(include=[np.int64, np.float64]).T

In [None]:
# Downcast RELAFFIL to 8-bit integers in the full dataset.
college['RELAFFIL'] = college['RELAFFIL'].astype(np.int8)

In [None]:
# The generic 'int'/'float' selectors resolve to the platform default widths,
# so a column stored as int8 (RELAFFIL above) is presumably no longer matched.
college.describe(include=['int', 'float']).T  # defaults to 64 bit int/floats

In [None]:
college.describe(include=['number']).T  # 'number' matches every numeric dtype, whatever its bit width

In [None]:
# Shrink both columns: a float dtype for MENONLY (the int8 cast above fails
# on it) and int8 for RELAFFIL.
college['MENONLY'] = college['MENONLY'].astype('float16')
college['RELAFFIL'] = college['RELAFFIL'].astype('int8')

In [None]:
# Replace the index with an explicit 64-bit integer index and measure its
# memory.  ``pd.Int64Index`` was deprecated in pandas 1.4 and removed in 2.0;
# ``pd.Index(..., dtype='int64')`` is the supported spelling.  Passing the
# values through ``to_numpy()`` builds the index from a concrete array, so a
# range-based index is not handed back unchanged.
college.index = pd.Index(college.index.to_numpy(), dtype='int64',
                         name=college.index.name)
college.index.memory_usage()

# Selecting the smallest of the largest

In [None]:
# Load the movie data and keep only the title, score and budget columns.
score_budget_cols = ['movie_title', 'imdb_score', 'budget']
movie = pd.read_csv('data/movie.csv')
movie2 = movie[score_budget_cols]
movie2.head()

In [None]:
# First rows of the 100 movies with the highest imdb_score.
movie2.nlargest(100, 'imdb_score').head()

In [None]:
# Among the 100 highest-scored movies, pick the 5 with the smallest budget.
top_100_by_score = movie2.nlargest(100, 'imdb_score')
top_100_by_score.nsmallest(5, 'budget')

# Selecting the largest of each group by sorting

In [None]:
# Load the movie data and keep only the title, year and score columns.
year_score_cols = ['movie_title', 'title_year', 'imdb_score']
movie = pd.read_csv('data/movie.csv')
movie2 = movie[year_score_cols]

In [None]:
# Most recent years first, sorted by release year only.
movie2.sort_values('title_year', ascending=False).head()

In [None]:
# Sort by year and then by score, both descending, so that within each
# year the highest-scored movie comes first.
movie3 = movie2.sort_values(by=['title_year', 'imdb_score'],
                            ascending=[False, False])
movie3.head()

In [None]:
# Keep only the first row encountered for each year.
movie_top_year = movie3.drop_duplicates(subset=['title_year'], keep='first')
movie_top_year.head()

In [None]:
# For each (year, content rating) pair, find the row that sorts first when
# budgets are ordered ascending within the pair.
rating_budget_cols = ['movie_title', 'title_year', 'content_rating', 'budget']
sort_keys = ['title_year', 'content_rating', 'budget']
group_keys = ['title_year', 'content_rating']

movie4 = movie[rating_budget_cols]
# Year and rating descending, budget ascending.
movie4_sorted = movie4.sort_values(by=sort_keys, ascending=[False, False, True])
movie4_sorted.drop_duplicates(subset=group_keys).head(10)

# Replicating nlargest with sort_values

In [None]:
# Reference result built with nlargest/nsmallest: the 5 lowest-budget movies
# among the 100 highest-scored ones.
movie = pd.read_csv('data/movie.csv')
movie2 = movie[['movie_title', 'imdb_score', 'budget']]
top_scored = movie2.nlargest(100, 'imdb_score')
movie_smallest_largest = top_scored.nsmallest(5, 'budget')
movie_smallest_largest

In [None]:
# Same idea with sort_values: sort by score descending, keep the first 100
# rows, and preview them.
movie2.sort_values('imdb_score', ascending=False).head(100).head()

In [None]:
# Take the first 100 rows by descending score, then order those by budget
# (ascending) and show the first rows.
top_100_sorted = movie2.sort_values('imdb_score', ascending=False).head(100)
top_100_sorted.sort_values('budget').head()

In [None]:
# Last rows of the nlargest result, for comparison with the sort_values approach.
movie2.nlargest(100, 'imdb_score').tail()

In [None]:
# Last rows of the sort_values result, for comparison with the nlargest approach.
movie2.sort_values('imdb_score', ascending=False).head(100).tail()

# Calculating a trailing stop order price

In [None]:
import pandas_datareader as pdr

### Note: pandas_datareader issues
pandas_datareader can have issues when the data source is 'google'. It can also read from Yahoo! Finance, so if 'google' fails, try switching the data source to 'yahoo'.

In [None]:
# Download the price history for the 'tsla' symbol starting at 2017-1-1.
# NOTE(review): needs network access and a working 'yahoo' reader in the
# installed pandas_datareader — confirm before relying on this cell.
tsla = pdr.DataReader('tsla', data_source='yahoo',start='2017-1-1')
tsla.head(8)

In [None]:
# Keep only the 'Close' column.
tsla_close = tsla['Close']

In [None]:
# Running maximum of the close up to and including each row.
tsla_cummax = tsla_close.cummax()
tsla_cummax.head(8)

In [None]:
# Trailing stop price: 90% of the highest close seen so far.
tsla_trailing_stop = tsla_cummax * .9
tsla_trailing_stop.head(8)

## There's more...

In [None]:
def set_trailing_loss(symbol, purchase_date, perc, data_source='yahoo'):
    """Return the trailing stop price for a symbol.

    The stop price at each row is ``perc`` times the highest 'Close' value
    seen from ``purchase_date`` up to and including that row.

    Parameters
    ----------
    symbol : str
        Ticker symbol passed to ``pdr.DataReader``.
    purchase_date : str or datetime-like
        Start date of the price history to download.
    perc : float
        Fraction of the running maximum close used as the stop price,
        e.g. ``.9`` places the stop 10% below the running maximum.
    data_source : str, default 'yahoo'
        Name of the ``pandas_datareader`` source to query.  The returned
        data must contain a 'Close' column.

    Returns
    -------
    The running maximum of the 'Close' column multiplied by ``perc``.
    """
    prices = pdr.DataReader(symbol, data_source, start=purchase_date)
    return prices['Close'].cummax() * perc

In [None]:
# Trailing stop for 'msft' from 2017-6-1, set at 85% of the running maximum close.
msft_trailing_stop = set_trailing_loss(symbol='msft',
                                       purchase_date='2017-6-1',
                                       perc=.85)
msft_trailing_stop.head()