In [2]:
#Developing a Data Analysis Routine

import pandas as pd
import numpy as np

# Load the college dataset; the raw frame is kept untouched and all work
# happens on a copy.
# PC location of college.csv, kept for reference:
#   C:\Users\Kyle\OneDrive\Desktop\python\pandas_data\college.csv
college_raw = pd.read_csv(
    "/Users/kylejohnson/Desktop/Python Files/python/Portfolio_Files/pd_cookbook_data/college.csv"
)
college = college_raw.copy()

# Draw a single random row (fixed seed so the same row comes back each run).
college.sample(n=1, random_state=42)

# Dimensions of the frame as (rows, columns).
college.shape

# Column names, non-null counts and dtypes.
college.info()

# Summary statistics of the numeric columns, one row per column.
college.describe(include=[np.number]).transpose()

# Summary statistics of the object columns, one row per column.
college.describe(include=[np.object_]).transpose()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7535 entries, 0 to 7534
Data columns (total 27 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   INSTNM              7535 non-null   object 
 1   CITY                7535 non-null   object 
 2   STABBR              7535 non-null   object 
 3   HBCU                7164 non-null   float64
 4   MENONLY             7164 non-null   float64
 5   WOMENONLY           7164 non-null   float64
 6   RELAFFIL            7535 non-null   int64  
 7   SATVRMID            1185 non-null   float64
 8   SATMTMID            1196 non-null   float64
 9   DISTANCEONLY        7164 non-null   float64
 10  UGDS                6874 non-null   float64
 11  UGDS_WHITE          6874 non-null   float64
 12  UGDS_BLACK          6874 non-null   float64
 13  UGDS_HISP           6874 non-null   float64
 14  UGDS_ASIAN          6874 non-null   float64
 15  UGDS_AIAN           6874 non-null   float64
 16  UGDS_N

Unnamed: 0,count,unique,top,freq
INSTNM,7535,7535,Alabama A & M University,1
CITY,7535,2514,New York,87
STABBR,7535,59,CA,773
MD_EARN_WNE_P10,6413,598,PrivacySuppressed,822
GRAD_DEBT_MDN_SUPP,7503,2038,PrivacySuppressed,1510


In [11]:
# Reducing Memory by Changing Data Types

# Reload the college data and work on a copy of the raw frame.
college_raw = pd.read_csv(
    "/Users/kylejohnson/Desktop/Python Files/python/Portfolio_Files/pd_cookbook_data/college.csv"
)
college = college_raw.copy()

# Columns used for the memory comparison.
different_cols = ["RELAFFIL", "SATMTMID", "CURROPER", "INSTNM", "STABBR"]

col2 = college.loc[:, different_cols]
col2.head()

# Check the dtype of every selected column.
col2.dtypes

# Memory usage of each column; deep=True also inspects object contents.
original_mem = col2.memory_usage(deep=True)
original_mem

# RELAFFIL only contains 0 and 1, so an 8-bit integer is sufficient.
col2["RELAFFIL"] = col2["RELAFFIL"].astype("int8")

# Verify the dtype change.
col2.dtypes

# Memory usage after the integer downcast.
col2.memory_usage(deep=True)

# Look for low-cardinality object columns.
col2.select_dtypes(include="object").nunique()

# STABBR is a good candidate: less than 1% of its values are unique.
col2["STABBR"] = col2["STABBR"].astype("category")

# Memory usage after both conversions.
new_mem = col2.memory_usage(deep=True)
new_mem

# Ratio of new memory to original memory, per column.
new_mem.div(original_mem)

# Looking at integers and objects: overwrite one CURROPER value and append
# one character to one INSTNM value, then re-check memory usage
# (memory increases for INSTNM).
# The edits are made on a scratch copy so that `college` itself is not
# modified; otherwise the describe() summaries below (and any later cell)
# would be computed on the altered row.
mem_demo = college[["CURROPER", "INSTNM"]].copy()
mem_demo.loc[0, "CURROPER"] = 10000000
mem_demo.loc[0, "INSTNM"] = mem_demo.loc[0, "INSTNM"] + "a"
mem_demo.memory_usage(deep=True)

# Using the include parameter of describe. All of the following return the
# same results.
college.describe(include=["int64", "float64"]).T
college.describe(include=["int", "float"]).T
college.describe(include=["number"]).T



Unnamed: 0,count,mean,std,min,25%,50%,75%,max
HBCU,7164.0,0.014238,0.118478,0.0,0.0,0.0,0.0,1.0
MENONLY,7164.0,0.009213,0.095546,0.0,0.0,0.0,0.0,1.0
WOMENONLY,7164.0,0.005304,0.072642,0.0,0.0,0.0,0.0,1.0
RELAFFIL,7535.0,0.190975,0.393096,0.0,0.0,0.0,0.0,1.0
SATVRMID,1185.0,522.819409,68.578862,290.0,475.0,510.0,555.0,765.0
SATMTMID,1196.0,530.76505,73.469767,310.0,482.0,520.0,565.0,785.0
DISTANCEONLY,7164.0,0.005583,0.074519,0.0,0.0,0.0,0.0,1.0
UGDS,6874.0,2356.83794,5474.275871,0.0,117.0,412.5,1929.5,151558.0
UGDS_WHITE,6874.0,0.510207,0.286958,0.0,0.2675,0.5557,0.747875,1.0
UGDS_BLACK,6874.0,0.189997,0.224587,0.0,0.036125,0.10005,0.2577,1.0


In [16]:
# Selecting the Smallest of the Largest

# Load the movies dataset and keep only title, score and budget.
movies_raw = pd.read_csv(
    "/Users/kylejohnson/Desktop/Python Files/python/Portfolio_Files/pd_cookbook_data/movie.csv"
)
movie2 = movies_raw[["movie_title", "imdb_score", "budget"]]
movie2.head()

# The 100 movies with the highest imdb_score.
movie2.nlargest(n=100, columns="imdb_score")

# Chain .nsmallest to get, out of those 100, the five with the lowest budget.
(
    movie2
    .nlargest(n=100, columns="imdb_score")
    .nsmallest(n=5, columns="budget")
)

Unnamed: 0,movie_title,imdb_score,budget
4804,Butterfly Girl,8.7,180000.0
4801,Children of Heaven,8.5,180000.0
4706,12 Angry Men,8.9,350000.0
4550,A Separation,8.4,500000.0
4636,The Other Dream Team,8.4,500000.0


In [26]:
# Selecting the Largest of Each Group by Sorting
movie = movies_raw.copy()
movie[["movie_title", "title_year", "imdb_score"]]

# Use sort_values to sort by title_year.
movie[["movie_title", "title_year", "imdb_score"]].sort_values(
    "title_year",
    ascending=True
)

# To sort by multiple columns, pass a list.
movie[["movie_title", "title_year", "imdb_score"]].sort_values(
    ["title_year", "imdb_score"],
    ascending=True
)

# Keep the highest-rated movie of every year.
# drop_duplicates keeps the FIRST row of each title_year, so the frame must be
# sorted with the best score first inside each year. An ascending sort here
# would keep the lowest-rated movie instead.
movie[["movie_title", "title_year", "imdb_score"]].sort_values(
    ["title_year", "imdb_score"],
    ascending=False
).drop_duplicates(subset="title_year")

# Another way to pick the highest-rated movie of each year: the groupby method.
# Within every title_year group, sort by imdb_score (highest first) and keep
# the top row.
# The columns are selected explicitly after groupby so that title_year is
# part of each group's frame. Relying on apply() to pass the grouping column
# implicitly is deprecated in pandas 2.2, and without that column the final
# sort on title_year would fail.
(
    movie
    .groupby("title_year", as_index=False)[
        ["movie_title", "title_year", "imdb_score"]
    ]
    .apply(
        lambda year_df: year_df.sort_values(
            "imdb_score", ascending=False
        ).head(1)
    )
    # Drop the outer index level of the apply result.
    .droplevel(0)
    .sort_values("title_year", ascending=False)
)

# Sort some columns descending and one ascending: title_year and
# content_rating descending, budget ascending. Then keep the first row of
# every (title_year, content_rating) pair.
(
    movie.loc[:, ["movie_title", "title_year", "content_rating", "budget"]]
    .sort_values(
        by=["title_year", "content_rating", "budget"],
        ascending=[False, False, True],
    )
    .drop_duplicates(subset=["title_year", "content_rating"], keep="first")
)

  .apply(


Unnamed: 0,movie_title,title_year,content_rating,budget
4026,Compadres,2016.0,R,3000000.0
4658,Fight to the Finish,2016.0,PG-13,150000.0
4661,Rodeo Girl,2016.0,PG,500000.0
3252,The Wailing,2016.0,Not Rated,
4659,Alleluia! The Devil's Carnival,2016.0,,500000.0
...,...,...,...,...
2558,Lilyhammer,,TV-MA,34000000.0
807,"Sabrina, the Teenage Witch",,TV-G,3000000.0
848,Stargate SG-1,,TV-14,1400000.0
2436,Carlos,,Not Rated,


In [30]:
# Replicating nlargest with sort_values

# Reference result: among the 100 highest-scored movies, the five with the
# smallest budget, using nlargest / nsmallest.
(
    movie[["movie_title", "imdb_score", "budget"]]
    .nlargest(n=100, columns="imdb_score")
    .nsmallest(n=5, columns="budget")
)

# The top-100 step rewritten with sort_values + head.
(
    movie[["movie_title", "imdb_score", "budget"]]
    .sort_values(by="imdb_score", ascending=False)
    .head(100)
)

# Both steps rewritten with sort_values + head to select the five lowest
# budgets.
# NOTE: the recorded output of this version is not identical to the
# nlargest/nsmallest version -- presumably ties are broken differently.
(
    movie[["movie_title", "imdb_score", "budget"]]
    .sort_values(by="imdb_score", ascending=False)
    .head(100)
    .sort_values(by="budget")
    .head(5)
)

Unnamed: 0,movie_title,imdb_score,budget
4815,A Charlie Brown Christmas,8.4,150000.0
4801,Children of Heaven,8.5,180000.0
4804,Butterfly Girl,8.7,180000.0
4706,12 Angry Men,8.9,350000.0
4636,The Other Dream Team,8.4,500000.0


In [49]:
#Calculating a Trailing Stop Order Price on Stock Prices

import datetime
import yfinance as yf
import pandas as pd
import pathlib

# Local cache for the downloaded prices, so re-runs can read from disk.
cache_file = pathlib.Path("tsla.parquet")

if not cache_file.exists():
    # No cached copy yet: download TSLA prices and store them for next time.
    tsla = yf.download(
        "TSLA",
        start="2017-01-01",
        auto_adjust=True,
        progress=False,
    )
    tsla.to_parquet(cache_file)
else:
    tsla = pd.read_parquet(cache_file)

print(tsla.head(8))

# For simplicity, work with the closing price only.
tsla_close = tsla["Close"]

# Running maximum of the close: the highest closing price seen so far.
tsla_cummax = tsla_close.cummax()
tsla_cummax.head()

# To limit the downside to 10%, multiply the running maximum by 0.9;
# the result is the trailing stop order price.
(tsla_cummax * 0.9).head()

Price           Close       High        Low       Open     Volume
Ticker           TSLA       TSLA       TSLA       TSLA       TSLA
Date                                                             
2017-01-03  14.466000  14.688667  14.064000  14.324000   88849500
2017-01-04  15.132667  15.200000  14.287333  14.316667  168202500
2017-01-05  15.116667  15.165333  14.796667  15.094667   88675500
2017-01-06  15.267333  15.354000  15.030000  15.128667   82918500
2017-01-09  15.418667  15.461333  15.200000  15.264667   59692500
2017-01-10  15.324667  15.466667  15.126000  15.466667   54900000
2017-01-11  15.315333  15.332000  15.112000  15.271333   54762000
2017-01-12  15.306000  15.380000  15.038667  15.270667   56853000


Ticker,TSLA
Date,Unnamed: 1_level_1
2017-01-03,13.0194
2017-01-04,13.6194
2017-01-05,13.6194
2017-01-06,13.7406
2017-01-09,13.8768
