# Data Cleaning

### Imports

In [2]:
import pandas as pd
import numpy as np
import os

**Load** `fight_songs.csv` from the `data` folder (two directory levels above this notebook's working directory) into a dataframe

In [3]:
# Build a relative path: two directory levels up, then data/fight_songs.csv.
data_path = os.path.join(os.pardir, os.pardir, "data", "fight_songs.csv")
fight_songs = pd.read_csv(data_path)

# Preview the first rows to confirm the file loaded as expected.
fight_songs.head()

Unnamed: 0,school,conference,song_name,writers,year,student_writer,official_song,contest,bpm,sec_duration,...,win_won,victory_win_won,rah,nonsense,colors,men,opponents,spelling,trope_count,spotify_id
0,Notre Dame,Independent,Victory March,Michael J. Shea and John F. Shea,1908,No,Yes,No,152,64,...,Yes,Yes,Yes,No,Yes,Yes,No,No,6,15a3ShKX3XWKzq0lSS48yr
1,Baylor,Big 12,Old Fight,Dick Baker and Frank Boggs,1947,Yes,Yes,No,76,99,...,Yes,Yes,No,No,Yes,No,No,Yes,5,2ZsaI0Cu4nz8DHfBkPt0Dl
2,Iowa State,Big 12,Iowa State Fights,"Jack Barker, Manly Rice, Paul Gnam, Rosalind K...",1930,Yes,Yes,No,155,55,...,No,No,Yes,No,No,Yes,No,Yes,4,3yyfoOXZQCtR6pfRJqu9pl
3,Kansas,Big 12,I'm a Jayhawk,"George ""Dumpy"" Bowles",1912,Yes,Yes,No,137,62,...,No,No,No,Yes,No,Yes,Yes,No,3,0JzbjZgcjugS0dmPjF9R89
4,Kansas State,Big 12,Wildcat Victory,Harry E. Erickson,1927,Yes,Yes,No,80,67,...,No,Yes,No,No,Yes,No,No,No,3,4xxDK4g1OHhZ44sTFy8Ktm


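Notice that the `year` column has **some weird values** in it (the placeholder string `"Unknown"` appears alongside the actual years), so pandas stores it as an object dtype (specifically, each value is a string)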

In [4]:
# Show the most frequent values in `year`; print() is needed because only the
# last expression of a cell is displayed automatically.
print(fight_songs.year.value_counts().head())

# Inspect the Python type of the `year` entry labelled 0.
type(fight_songs['year'][0])

Unknown    5
1915       4
1912       4
1919       3
1950       3
Name: year, dtype: int64


str

Write a quick function to **turn the value `"Unknown"` into `np.nan`**, wherever it appears in the dataframe.  

In [5]:
def turn_value_null(frame, value):
    '''
    data cleaning: swap a placeholder value for np.nan

    input:
        frame: dataframe to clean
        value: the value to replace with np.nan wherever it occurs

    output: dataframe in which every occurrence of value is np.nan
            (the frame passed in is not modified)
    '''
    return frame.replace(to_replace=value, value=np.nan)


# Rebind fight_songs to the frame returned by turn_value_null, in which every
# 'Unknown' entry has been replaced by NaN.
fight_songs = turn_value_null(fight_songs, 'Unknown')

# Count the missing values in `year` to confirm the replacement took effect.
print(f'fight_songs now has {fight_songs.year.isnull().sum()} nulls')

fight_songs now has 5 nulls


Now, write a function that **removes all the nulls** by dropping every row that contains one.

In [6]:
def drop_nulls(frame):
    '''
    data cleaning: discard incomplete rows

    input: dataframe
    output: dataframe keeping only the rows that have no np.nan in any column
    '''
    return frame.dropna(how="any", axis="index")

# Rebind fight_songs to the frame returned by drop_nulls (rows containing NaN removed).
fight_songs = drop_nulls(fight_songs)

# Summarise the column dtypes and non-null counts of the result.
fight_songs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60 entries, 0 to 64
Data columns (total 23 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   school           60 non-null     object
 1   conference       60 non-null     object
 2   song_name        60 non-null     object
 3   writers          60 non-null     object
 4   year             60 non-null     object
 5   student_writer   60 non-null     object
 6   official_song    60 non-null     object
 7   contest          60 non-null     object
 8   bpm              60 non-null     int64 
 9   sec_duration     60 non-null     int64 
 10  fight            60 non-null     object
 11  number_fights    60 non-null     int64 
 12  victory          60 non-null     object
 13  win_won          60 non-null     object
 14  victory_win_won  60 non-null     object
 15  rah              60 non-null     object
 16  nonsense         60 non-null     object
 17  colors           60 non-null     obje

Next, write a function to **convert the dtype of the `year` column to `int`**

In [7]:
def turn_column_int(column):
    '''
    data cleaning: cast a column to int type

    input: column from dataframe
    output: column with every value converted to int
    '''
    return column.astype(int)

# Overwrite the `year` column with its integer-typed version.
fight_songs['year'] = turn_column_int(fight_songs['year'])

Now, write a function that **loads fight_songs.csv** into a dataframe and returns it.

In [8]:
def load_fight_songs():
    '''
    reads fight_songs.csv with pd.read_csv

    the file is looked up in the data folder two directory levels above
    the current working directory

    outputs: dataframe holding the contents of fight_songs.csv
    '''
    path_parts = (os.pardir, os.pardir, "data", "fight_songs.csv")
    return pd.read_csv(os.path.join(*path_parts))

## Composite Function

**Write a function** (which doesn't take in any parameters) that:
- **calls** `load_fight_songs`, `turn_value_null`, `drop_nulls`, and `turn_column_int` **sequentially**
    - (make sure to include all the specific parameters of those functions called above which are necessary to make them run)
    
    
- **returns** a dataframe at the end

The result should have ***the same columns, rows and data*** as the dataframe we ended up with above

In [9]:
def load_clean_fight_songs():
    '''
    loads fight_songs.csv and cleans it by running the helpers in order:

        1. load_fight_songs()
             - reads fight_songs.csv into a dataframe

        2. turn_value_null(df, 'Unknown')
             - turns every "Unknown" value into np.nan

        3. drop_nulls(df)
             - drops every row that contains a null

        4. turn_column_int(df['year'])
             - casts the 'year' column to int type

    result:
        dataframe of fight_songs.csv, loaded and cleaned
    '''
    cleaned = (
        load_fight_songs()
        .pipe(turn_value_null, 'Unknown')
        .pipe(drop_nulls)
    )
    # The cast is applied to the single column and written back in place.
    cleaned['year'] = turn_column_int(cleaned['year'])
    return cleaned

In [10]:
# Run the whole load-and-clean sequence in one call and preview the result.
df = load_clean_fight_songs()
df.head()

Unnamed: 0,school,conference,song_name,writers,year,student_writer,official_song,contest,bpm,sec_duration,...,win_won,victory_win_won,rah,nonsense,colors,men,opponents,spelling,trope_count,spotify_id
0,Notre Dame,Independent,Victory March,Michael J. Shea and John F. Shea,1908,No,Yes,No,152,64,...,Yes,Yes,Yes,No,Yes,Yes,No,No,6,15a3ShKX3XWKzq0lSS48yr
1,Baylor,Big 12,Old Fight,Dick Baker and Frank Boggs,1947,Yes,Yes,No,76,99,...,Yes,Yes,No,No,Yes,No,No,Yes,5,2ZsaI0Cu4nz8DHfBkPt0Dl
2,Iowa State,Big 12,Iowa State Fights,"Jack Barker, Manly Rice, Paul Gnam, Rosalind K...",1930,Yes,Yes,No,155,55,...,No,No,Yes,No,No,Yes,No,Yes,4,3yyfoOXZQCtR6pfRJqu9pl
3,Kansas,Big 12,I'm a Jayhawk,"George ""Dumpy"" Bowles",1912,Yes,Yes,No,137,62,...,No,No,No,Yes,No,Yes,Yes,No,3,0JzbjZgcjugS0dmPjF9R89
4,Kansas State,Big 12,Wildcat Victory,Harry E. Erickson,1927,Yes,Yes,No,80,67,...,No,Yes,No,No,Yes,No,No,No,3,4xxDK4g1OHhZ44sTFy8Ktm


## .py Files

In [11]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
import sys
# Absolute path of the directory two levels above the working directory; the
# `src` package imported in a later cell is presumably located there.
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if module_path not in sys.path:  # guard keeps re-runs of this cell from appending duplicates
    sys.path.append(module_path)

In [15]:
import src.data_cleaning as data_cleaning

data cleaning imported


In [19]:
# Call load_clean_fight_songs from the imported data_cleaning module rather than the notebook-defined version.
df_from_py = data_cleaning.load_clean_fight_songs()

In [20]:
# Preview the module-produced frame so it can be compared with the df.head() output above.
df_from_py.head()

Unnamed: 0,school,conference,song_name,writers,year,student_writer,official_song,contest,bpm,sec_duration,...,win_won,victory_win_won,rah,nonsense,colors,men,opponents,spelling,trope_count,spotify_id
0,Notre Dame,Independent,Victory March,Michael J. Shea and John F. Shea,1908,No,Yes,No,152,64,...,Yes,Yes,Yes,No,Yes,Yes,No,No,6,15a3ShKX3XWKzq0lSS48yr
1,Baylor,Big 12,Old Fight,Dick Baker and Frank Boggs,1947,Yes,Yes,No,76,99,...,Yes,Yes,No,No,Yes,No,No,Yes,5,2ZsaI0Cu4nz8DHfBkPt0Dl
2,Iowa State,Big 12,Iowa State Fights,"Jack Barker, Manly Rice, Paul Gnam, Rosalind K...",1930,Yes,Yes,No,155,55,...,No,No,Yes,No,No,Yes,No,Yes,4,3yyfoOXZQCtR6pfRJqu9pl
3,Kansas,Big 12,I'm a Jayhawk,"George ""Dumpy"" Bowles",1912,Yes,Yes,No,137,62,...,No,No,No,Yes,No,Yes,Yes,No,3,0JzbjZgcjugS0dmPjF9R89
4,Kansas State,Big 12,Wildcat Victory,Harry E. Erickson,1927,Yes,Yes,No,80,67,...,No,Yes,No,No,Yes,No,No,No,3,4xxDK4g1OHhZ44sTFy8Ktm
