# Kaggle CSV Cleaning

# Cleaned Team Statistics Data

#### The CSV data file from Kaggle gives us key statistics of NFL teams.

#### Kaggle Dataset: https://www.kaggle.com/datasets/nickcantalupa/nfl-team-data-2003-2023 

#### This Jupyter notebook cleans the original data file of the Kaggle dataset.

In [2]:
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Load the raw Kaggle export into a DataFrame, then preview its first rows.
nfl_team_data = pd.read_csv(
    "team_stats_2003_2023.csv",
    encoding='utf-8',
)
display(nfl_team_data.head())

Unnamed: 0,year,team,wins,losses,win_loss_perc,points,points_opp,points_diff,mov,g,...,rush_td,rush_yds_per_att,rush_fd,penalties,penalties_yds,pen_fd,score_pct,turnover_pct,exp_pts_tot,ties
0,2003,New England Patriots,14,2,0.875,348,238,110,6.9,16,...,9,3.4,91,111,998,26,27.9,11.3,-136.51,
1,2003,Miami Dolphins,10,6,0.625,311,261,50,3.1,16,...,14,3.7,99,103,913,22,28.1,17.2,-177.92,
2,2003,Buffalo Bills,6,10,0.375,243,279,-36,-2.3,16,...,13,3.9,96,106,891,22,21.9,17.6,-230.07,
3,2003,New York Jets,6,10,0.375,283,299,-16,-1.0,16,...,8,4.0,78,69,550,15,32.4,11.8,-107.89,
4,2003,Baltimore Ravens,10,6,0.625,391,281,110,6.9,16,...,18,4.8,115,126,970,23,31.8,16.6,-220.5,


In [4]:
# Work on an independent (deep) copy so the frame loaded from disk stays
# untouched by the cleaning steps that follow.
nfl_team_data_edited = nfl_team_data.copy(deep=True)

In [5]:
# Preview the first five rows of the working copy.
display(nfl_team_data_edited.head(n=5))

Unnamed: 0,year,team,wins,losses,win_loss_perc,points,points_opp,points_diff,mov,g,...,rush_td,rush_yds_per_att,rush_fd,penalties,penalties_yds,pen_fd,score_pct,turnover_pct,exp_pts_tot,ties
0,2003,New England Patriots,14,2,0.875,348,238,110,6.9,16,...,9,3.4,91,111,998,26,27.9,11.3,-136.51,
1,2003,Miami Dolphins,10,6,0.625,311,261,50,3.1,16,...,14,3.7,99,103,913,22,28.1,17.2,-177.92,
2,2003,Buffalo Bills,6,10,0.375,243,279,-36,-2.3,16,...,13,3.9,96,106,891,22,21.9,17.6,-230.07,
3,2003,New York Jets,6,10,0.375,283,299,-16,-1.0,16,...,8,4.0,78,69,550,15,32.4,11.8,-107.89,
4,2003,Baltimore Ravens,10,6,0.625,391,281,110,6.9,16,...,18,4.8,115,126,970,23,31.8,16.6,-220.5,


In [6]:
# Print the dtype pandas inferred for each column of the working copy.
print(nfl_team_data_edited.dtypes)

year                      int64
team                     object
wins                      int64
losses                    int64
win_loss_perc           float64
points                    int64
points_opp                int64
points_diff               int64
mov                     float64
g                         int64
total_yards               int64
plays_offense             int64
yds_per_play_offense    float64
turnovers                 int64
fumbles_lost              int64
first_down                int64
pass_cmp                  int64
pass_att                  int64
pass_yds                  int64
pass_td                   int64
pass_int                  int64
pass_net_yds_per_att    float64
pass_fd                   int64
rush_att                  int64
rush_yds                  int64
rush_td                   int64
rush_yds_per_att        float64
rush_fd                   int64
penalties                 int64
penalties_yds             int64
pen_fd                    int64
score_pc

In [7]:
# Cast 'ties' to pandas' nullable integer dtype ('Int64', capital I) so that
# missing entries are kept as <NA> instead of forcing a float column.
ties_nullable = nfl_team_data_edited['ties'].astype('Int64')
nfl_team_data_edited['ties'] = ties_nullable

In [8]:
# Re-print the column dtypes to check the result of the 'ties' cast.
print(nfl_team_data_edited.dtypes)

year                      int64
team                     object
wins                      int64
losses                    int64
win_loss_perc           float64
points                    int64
points_opp                int64
points_diff               int64
mov                     float64
g                         int64
total_yards               int64
plays_offense             int64
yds_per_play_offense    float64
turnovers                 int64
fumbles_lost              int64
first_down                int64
pass_cmp                  int64
pass_att                  int64
pass_yds                  int64
pass_td                   int64
pass_int                  int64
pass_net_yds_per_att    float64
pass_fd                   int64
rush_att                  int64
rush_yds                  int64
rush_td                   int64
rush_yds_per_att        float64
rush_fd                   int64
penalties                 int64
penalties_yds             int64
pen_fd                    int64
score_pc

In [9]:
# Show the (row count, column count) of the working copy.
nfl_team_data_edited.shape

(672, 35)

In [10]:
# Check for fully duplicated rows (rows where every column matches another row).
# keep=False flags every member of a duplicate group, so all copies can be
# displayed side by side.
duplicate_records = nfl_team_data_edited[nfl_team_data_edited.duplicated(keep=False)]

# keep="first" flags only the repeated copies (the first occurrence is not
# counted). The count is computed outside the f-string because reusing double
# quotes inside a double-quoted f-string is a SyntaxError before Python 3.12.
extra_duplicate_count = int(nfl_team_data_edited.duplicated(keep="first").sum())

print(f"total records: {len(nfl_team_data_edited)} \n")
print(f"duplicate records: {extra_duplicate_count} \n")
print("All instances of the duplicated data","\n", "-"*50)
display(duplicate_records)

total records: 672 

duplicate records: 0 

All instances of the duplicated data 
 --------------------------------------------------


Unnamed: 0,year,team,wins,losses,win_loss_perc,points,points_opp,points_diff,mov,g,...,rush_td,rush_yds_per_att,rush_fd,penalties,penalties_yds,pen_fd,score_pct,turnover_pct,exp_pts_tot,ties


In [11]:
# List the distinct values in the 'ties' column (the output includes <NA>).
nfl_team_data_edited['ties'].unique()

<IntegerArray>
[<NA>, 0, 1]
Length: 3, dtype: Int64

In [12]:
# Replace missing 'ties' entries with 0.
# NOTE(review): this assumes a missing value means "no ties" — confirm against the dataset docs.
nfl_team_data_edited['ties'] = nfl_team_data_edited['ties'].fillna(value=0)

In [13]:
# Re-list the distinct 'ties' values to confirm no <NA> remains after the fill.
nfl_team_data_edited['ties'].unique()

<IntegerArray>
[0, 1]
Length: 2, dtype: Int64

In [14]:
# List the distinct 'mov' values. The output contains a nan, so at least one
# row is missing this field; 'mov' is among the columns dropped later on.
nfl_team_data_edited['mov'].unique()

array([  6.9,   3.1,  -2.3,  -1. ,  -2.4,  -1.7,  -4.3,  -3.4,  -7.8,
         9.5,   5. ,  -6.8,  -8. ,   5.4,   1.8,  -5.3,  -9. ,   8.4,
         3.9,  -3.9,   1.3,   0.9,   2.3,  -7.7,   7.4,   4.8,   2.9,
       -14.2,  11.1,   4.5,  -4.9,   7.6,   0.1,  -7.1,  10.7,  -1.2,
        -1.9,  -5.9,   8.3,   3. ,  -7.6,   7.9,  -2.8,  -7. ,  -1.6,
         2.8,   0.6,  -6.3,   0.2,  -3.6,   1. ,  -0.2,  -0.1,  -4.6,
       -12.1,   2.6,  -6. ,  -7.2,   4.4,   8.2,  -2.1,  12. ,   5.8,
       -10.7,   8.6,   4.9,   6.6,  -5.8,   6.8,   4.1,   1.1,   3.6,
        -5.7,  -2.9,   1.6, -10.2,  11.3,  -4.1,  -4.8, -11.8,   9.3,
        -0.7,  -1.4,   2.4,  -7.4,   4.2,   6.1,  -6.2,  11.8, -10.3,
         4.7,  -0.4,  10.8,   5.7,  -2.2,  -8.9,  -0.9,  -4.7,  19.7,
        -6.4,  -5.4, -10.6,   7.8,  -0.3,   6.7,   0.3,   8. ,  -5.6,
         8.1,   1.4,   1.5,   9. ,   3.4,  -6.1,   4. ,  -5. ,  -0.6,
        -9.7,   6.4,  -9.1, -10.9,   nan,   8.9,   7. ,  -8.1,  -3. ,
       -11.4,  -4.4,

In [15]:
# List the distinct team names. The output includes names that look like the
# same franchise before/after a rename or move (e.g. 'Oakland Raiders' and
# 'Las Vegas Raiders') — NOTE(review): confirm, and normalize before grouping by team.
nfl_team_data_edited['team'].unique()

array(['New England Patriots', 'Miami Dolphins', 'Buffalo Bills',
       'New York Jets', 'Baltimore Ravens', 'Cincinnati Bengals',
       'Pittsburgh Steelers', 'Cleveland Browns', 'Indianapolis Colts',
       'Tennessee Titans', 'Jacksonville Jaguars', 'Houston Texans',
       'Kansas City Chiefs', 'Denver Broncos', 'Oakland Raiders',
       'San Diego Chargers', 'Philadelphia Eagles', 'Dallas Cowboys',
       'Washington Redskins', 'New York Giants', 'Green Bay Packers',
       'Minnesota Vikings', 'Chicago Bears', 'Detroit Lions',
       'Carolina Panthers', 'New Orleans Saints', 'Tampa Bay Buccaneers',
       'Atlanta Falcons', 'St. Louis Rams', 'Seattle Seahawks',
       'San Francisco 49ers', 'Arizona Cardinals', 'Los Angeles Rams',
       'Los Angeles Chargers', 'Las Vegas Raiders',
       'Washington Football Team', 'Washington Commanders'], dtype=object)

In [16]:
# Columns removed from the cleaned output. After the drop the frame keeps
# year, team, wins, losses, win_loss_perc, total_yards and turnovers.
columns_to_drop = [
    # scoring / games
    'points', 'points_opp', 'points_diff', 'mov', 'g',
    # general offense
    'plays_offense', 'yds_per_play_offense', 'fumbles_lost', 'first_down',
    # passing
    'pass_cmp', 'pass_att', 'pass_yds', 'pass_td', 'pass_int',
    'pass_net_yds_per_att', 'pass_fd',
    # rushing
    'rush_att', 'rush_yds', 'rush_td', 'rush_yds_per_att', 'rush_fd',
    # penalties
    'penalties', 'penalties_yds', 'pen_fd',
    # remaining rate/summary columns
    'score_pct', 'turnover_pct', 'exp_pts_tot', 'ties',
]

# errors='ignore' skips any listed name that is not present in the frame, so
# re-running this cell after the columns are already gone does not raise.
nfl_team_data_edited = nfl_team_data_edited.drop(
    columns_to_drop,
    axis='columns',
    errors='ignore',
)

In [17]:
# Show the trimmed frame; as the output shows, the middle rows are elided with "...".
display(nfl_team_data_edited)

Unnamed: 0,year,team,wins,losses,win_loss_perc,total_yards,turnovers
0,2003,New England Patriots,14,2,0.875,5039,24
1,2003,Miami Dolphins,10,6,0.625,4609,34
2,2003,Buffalo Bills,6,10,0.375,4348,34
3,2003,New York Jets,6,10,0.375,4951,20
4,2003,Baltimore Ravens,10,6,0.625,4929,38
...,...,...,...,...,...,...,...
667,2023,Carolina Panthers,2,15,0.118,4510,20
668,2023,San Francisco 49ers,12,5,0.706,6773,18
669,2023,Los Angeles Rams,10,7,0.588,6108,18
670,2023,Seattle Seahawks,9,8,0.529,5490,17


In [18]:
# Write the trimmed frame to disk as UTF-8 CSV, with a header row and
# without the DataFrame index column.
nfl_team_data_edited.to_csv(
    'nfl_team_data_cleaned.csv',
    index=False,
    header=True,
    encoding='utf-8',
)