Data Wrangling

** IMPORTANT NOTE: Most data wrangling steps had to be downloaded in separate Jupyter Notebook files due to the large data sizes.  If working with larger memory/faster processing times, this will be done in one single notebook. **

This notebook combines 2023 NFL Game day data with the universal IATA Airport Code. This code will be used for merging with other data sources.

In [37]:
import pandas as pd


In [38]:
# def convert_excel_to_csv(excel_file):
#     # Specify the output path
#     output_path = r'C:\Users\hopeh\Desktop\data_science_bootcamp\flight_times_capstone\extra_fun_data\2023_nfl_schedule.csv'
    
#     # Read the Excel file
#     df = pd.read_excel(excel_file)
    
#     # Save to CSV in the specified folder
#     df.to_csv(output_path, index=False)
#     print(f"Converted and saved as: {output_path}")

# # Example usage
# convert_excel_to_csv(r"C:\Users\hopeh\Desktop\data_science_bootcamp\flight_times_capstone\extra_fun_data\2023_NFL_schedule.xlsx")

In [39]:
# load csv file
nfl_2023 = pd.read_csv(r'C:\Users\hopeh\Desktop\data_science_bootcamp\flight_times_capstone\extra_fun_data\2023_nfl_schedule.csv')

In [40]:
# explore data set

print(nfl_2023.head())

               Teams 2023-09-10 00:00:00 2023-09-17 00:00:00  \
0  Arizona Cardinals        @ Commanders          vs. Giants   
1    Atlanta Falcons        vs. Panthers         vs. Packers   
2   Baltimore Ravens          vs. Texans           @ Bengals   
3      Buffalo Bills        @ Jets (MNF)          vs Raiders   
4  Carolina Panthers           @ Falcons    vs. Saints (MNF)   

  2023-09-24 00:00:00 2023-10-01 00:00:00   2023-10-08 00:00:00  \
0         vs. Cowboys             @ 49ers           vs. Bengals   
1             @ Lions  @ Jaguars (London)            vs. Texans   
2           vs. Colts            @ Browns            @ Steelers   
3        @ Commanders        vs. Dolphins  vs. Jaguars (London)   
4          @ Seahawks         vs. Vikings               @ Lions   

  2023-10-15 00:00:00 2023-10-22 00:00:00   2023-10-29 00:00:00  \
0              @ Rams          @ Seahawks            vs. Ravens   
1      vs. Commanders        @ Buccaneers              @ Titans   
2   @ Titan

In [41]:
nfl_2023.columns = nfl_2023.columns.str.lower()

# Convert all string values in the DataFrame to lowercase
nfl_2023 = nfl_2023.applymap(lambda x: x.lower() if isinstance(x, str) else x)

# Check the updated DataFrame
print(nfl_2023.head())

               teams 2023-09-10 00:00:00 2023-09-17 00:00:00  \
0  arizona cardinals        @ commanders          vs. giants   
1    atlanta falcons        vs. panthers         vs. packers   
2   baltimore ravens          vs. texans           @ bengals   
3      buffalo bills        @ jets (mnf)          vs raiders   
4  carolina panthers           @ falcons    vs. saints (mnf)   

  2023-09-24 00:00:00 2023-10-01 00:00:00   2023-10-08 00:00:00  \
0         vs. cowboys             @ 49ers           vs. bengals   
1             @ lions  @ jaguars (london)            vs. texans   
2           vs. colts            @ browns            @ steelers   
3        @ commanders        vs. dolphins  vs. jaguars (london)   
4          @ seahawks         vs. vikings               @ lions   

  2023-10-15 00:00:00 2023-10-22 00:00:00   2023-10-29 00:00:00  \
0              @ rams          @ seahawks            vs. ravens   
1      vs. commanders        @ buccaneers              @ titans   
2   @ titan

  nfl_2023 = nfl_2023.applymap(lambda x: x.lower() if isinstance(x, str) else x)


In [42]:
nfl_2023.shape

(32, 19)

In [43]:
nfl_2023.columns

Index(['teams', '2023-09-10 00:00:00', '2023-09-17 00:00:00',
       '2023-09-24 00:00:00', '2023-10-01 00:00:00', '2023-10-08 00:00:00',
       '2023-10-15 00:00:00', '2023-10-22 00:00:00', '2023-10-29 00:00:00',
       '2023-11-05 00:00:00', '2023-11-12 00:00:00', '2023-11-19 00:00:00',
       '2023-11-26 00:00:00', '2023-12-03 00:00:00', '2023-12-10 00:00:00',
       '2023-12-17 00:00:00', '2023-12-24 00:00:00', '2023-12-31 00:00:00',
       'jan. 6 or 7'],
      dtype='object')

In [44]:
print(nfl_2023['teams'].unique())

['arizona cardinals' 'atlanta falcons' 'baltimore ravens' 'buffalo bills'
 'carolina panthers' 'chicago bears' 'cincinnati bengals'
 'cleveland browns' 'dallas cowboys' 'denver broncos' 'detroit lions'
 'green bay packers' 'houston texans' 'indianapolis colts'
 'jacksonville jaguars' 'kansas city chiefs' 'las vegas raiders'
 'los angeles chargers' 'los angeles rams' 'miami dolphins'
 'minnesota vikings' 'new england patriots' 'new orleans saints'
 'new york giants' 'new york jets' 'philadelphia eagles'
 'pittsburgh steelers' 'san francisco 49ers' 'seattle seahawks'
 'tampa bay buccaneers' 'tennessee titans' 'washington commanders']


In [74]:
# Normalize Game Notations

# clean_game_notation checks if the game is a home or away game
# and extracts the opponent's name. returns a tuple 
# with the game result type ('home', 'away', or 'bye') and the opponent.


# Function to clean game notation and extract result type and opponent
def clean_game_notation(cell_value):
    if isinstance(cell_value, str):
        if '@' in cell_value:
            opponent = cell_value.split('@')[1].strip()  # Get opponent
            return 'away', opponent
        elif 'vs.' in cell_value:
            opponent = cell_value.split('vs.')[1].strip()  # Get opponent
            return 'home', opponent
        elif 'bye' in cell_value:
            return 'bye', None
    return None, None

# Reshape Data
cleaned_data = []

# Iterate over each team and each game
for index, row in nfl_2023.iterrows():
    team_name = row['teams']
    for date in nfl_2023.columns[1:]:
        game_result, opponent = clean_game_notation(row[date])
        cleaned_data.append({
            'teams': team_name,
            'date': date,
            'game result': game_result,
            'opponent': opponent  # Capture opponent here
        })

# Convert to DataFrame
cleaned_df = pd.DataFrame(cleaned_data)

# Handle Bye Weeks
cleaned_df['opponent'] = cleaned_df['opponent'].fillna('bye')

# Show the cleaned DataFrame
print(cleaned_df.head())


               teams                 date game result    opponent
0  arizona cardinals  2023-09-10 00:00:00        away  commanders
1  arizona cardinals  2023-09-17 00:00:00        home      giants
2  arizona cardinals  2023-09-24 00:00:00        home     cowboys
3  arizona cardinals  2023-10-01 00:00:00        away       49ers
4  arizona cardinals  2023-10-08 00:00:00        home     bengals


In [76]:
cleaned_df.columns

Index(['teams', 'date', 'game result', 'opponent'], dtype='object')

In [78]:
cleaned_df.rename(columns={'Game Result': 'Location'}, inplace=True)

In [80]:
cleaned_df.shape


(576, 4)

In [82]:
# Convert column names to lowercase


In [86]:
print(cleaned_df['opponent'].unique())

['commanders' 'giants' 'cowboys' '49ers' 'bengals' 'rams' 'seahawks'
 'ravens' 'browns' 'falcons' 'texans' 'steelers' 'bye' 'bears' 'eagles'
 'panthers' 'packers' 'lions' 'jaguars (london)' 'buccaneers' 'titans'
 'vikings' 'cardinals' 'saints' 'jets' 'colts' 'titans (london)'
 'bengals (tnf)' 'chargers (snf)' 'jaguars (snf)' '49ers (mnf)' 'dolphins'
 'jets (mnf)' 'patriots' 'buccaneers (tnf)' 'broncos (mnf)' 'chiefs'
 'chargers' 'saints (mnf)' 'bears (tnf)' 'jaguars' 'broncos'
 'commanders (tnf)' 'raiders' 'panthers (tnf)' 'vikings (mnf)'
 'rams (mnf)' 'bills (snf)' 'ravens (tnf)' 'jaguars (mnf)'
 'steelers (mnf)' 'jets (tnf)' 'chargers (mnf)' 'seahawks (tnf)' 'bills'
 'chiefs (tnf)' 'bills (mnf)' 'vikings (snf)' 'packers (tnf)'
 'raiders (mnf)' 'patriots (germany)' 'saints (tnf)' 'bengals (mnf)'
 'jets (snf)' 'broncos (tnf)' 'dolphins (germany)' 'eagles (mnf)'
 'patriots (mnf)' 'cowboys (mnf)' 'bears (snf)' 'ravens (snf)'
 'raiders (tnf)' 'patriots (snf)' 'eagles (snf)' 'titans (mnf)'

In [88]:
cleaned_df.info

<bound method DataFrame.info of                      teams                 date game result    opponent
0        arizona cardinals  2023-09-10 00:00:00        away  commanders
1        arizona cardinals  2023-09-17 00:00:00        home      giants
2        arizona cardinals  2023-09-24 00:00:00        home     cowboys
3        arizona cardinals  2023-10-01 00:00:00        away       49ers
4        arizona cardinals  2023-10-08 00:00:00        home     bengals
..                     ...                  ...         ...         ...
571  washington commanders  2023-12-10 00:00:00         bye         bye
572  washington commanders  2023-12-17 00:00:00        away        rams
573  washington commanders  2023-12-24 00:00:00        away        jets
574  washington commanders  2023-12-31 00:00:00        home       49ers
575  washington commanders          jan. 6 or 7        home     cowboys

[576 rows x 4 columns]>

In [128]:
# create a mapping of teams to IATA codes
iata_mapping = {
    'Arizona Cardinals': 'PHX',
    'Atlanta Falcons': 'ATL',
    'Baltimore Ravens': 'BWI',
    'Buffalo Bills': 'BUF',
    'Carolina Panthers': 'CLT',
    'Chicago Bears': 'ORD',
    'Cincinnati Bengals': 'CVG',
    'Cleveland Browns': 'CLE',
    'Dallas Cowboys': 'DFW',
    'Denver Broncos': 'DEN',
    'Detroit Lions': 'DTW',
    'Green Bay Packers': 'GRB',
    'Houston Texans': 'IAH',
    'Indianapolis Colts': 'IND',
    'Jacksonville Jaguars': 'JAX',
    'Kansas City Chiefs': 'MCI',
    'Las Vegas Raiders': 'LAS',
    'Los Angeles Chargers': 'LAX',
    'Los Angeles Rams': 'LAX',
    'Miami Dolphins': 'MIA',
    'Minnesota Vikings': 'MSP',
    'New England Patriots': 'BOS',
    'New Orleans Saints': 'MSY',
    'New York Giants': 'EWR',
    'New York Jets': 'EWR',
    'Philadelphia Eagles': 'PHL',
    'Pittsburgh Steelers': 'PIT',
    'San Francisco 49ers': 'SFO',
    'Seattle Seahawks': 'SEA',
    'Tampa Bay Buccaneers': 'TPA',
    'Tennessee Titans': 'BNA',
    'Washington Commanders': 'DCA',
}

In [130]:
# Function to clean opponent name
def clean_opponent_name(opponent):
    # Remove any additional text (e.g., '(MNF)', '(Germany)', etc.)
    if '(' in opponent:
        opponent = opponent.split('(')[0].strip()  # Take the part before '('
    return opponent

In [132]:
cleaned_df.columns

Index(['teams', 'date', 'game result', 'opponent'], dtype='object')

In [134]:
print(cleaned_df.head())

               teams                 date game result    opponent
0  arizona cardinals  2023-09-10 00:00:00        away  commanders
1  arizona cardinals  2023-09-17 00:00:00        home      giants
2  arizona cardinals  2023-09-24 00:00:00        home     cowboys
3  arizona cardinals  2023-10-01 00:00:00        away       49ers
4  arizona cardinals  2023-10-08 00:00:00        home     bengals


In [136]:
# Get unique opponent names from the DataFrame
unique_opponents = cleaned_df['opponent'].unique()
print(unique_opponents)

def clean_opponent_name(opponent):
    # Remove any text in parentheses and extra spaces
    opponent = opponent.split('(')[0].strip()
    if opponent.endswith('s'):
        opponent = opponent[:-1]
    return opponent

def get_iata(row):
    if row['location'] == 'home':
        return iata_mapping.get(row['team'], None)
    elif row['location'] == 'away':
        opponent = clean_opponent_name(row['opponent'])
        return iata_mapping.get(opponent, None)
    return None

# Apply the function to get IATA codes again
cleaned_df['iata code'] = cleaned_df.apply(get_iata, axis=1)



['commanders' 'giants' 'cowboys' '49ers' 'bengals' 'rams' 'seahawks'
 'ravens' 'browns' 'falcons' 'texans' 'steelers' 'bye' 'bears' 'eagles'
 'panthers' 'packers' 'lions' 'jaguars (london)' 'buccaneers' 'titans'
 'vikings' 'cardinals' 'saints' 'jets' 'colts' 'titans (london)'
 'bengals (tnf)' 'chargers (snf)' 'jaguars (snf)' '49ers (mnf)' 'dolphins'
 'jets (mnf)' 'patriots' 'buccaneers (tnf)' 'broncos (mnf)' 'chiefs'
 'chargers' 'saints (mnf)' 'bears (tnf)' 'jaguars' 'broncos'
 'commanders (tnf)' 'raiders' 'panthers (tnf)' 'vikings (mnf)'
 'rams (mnf)' 'bills (snf)' 'ravens (tnf)' 'jaguars (mnf)'
 'steelers (mnf)' 'jets (tnf)' 'chargers (mnf)' 'seahawks (tnf)' 'bills'
 'chiefs (tnf)' 'bills (mnf)' 'vikings (snf)' 'packers (tnf)'
 'raiders (mnf)' 'patriots (germany)' 'saints (tnf)' 'bengals (mnf)'
 'jets (snf)' 'broncos (tnf)' 'dolphins (germany)' 'eagles (mnf)'
 'patriots (mnf)' 'cowboys (mnf)' 'bears (snf)' 'ravens (snf)'
 'raiders (tnf)' 'patriots (snf)' 'eagles (snf)' 'titans (mnf)'

KeyError: 'location'

In [None]:
# Filter the DataFrame to show only rows where 'IATA Code' is NaN
nan_iata_df = cleaned_df[cleaned_df['IATA Code'].isna()]

# Display the resulting DataFrame
print(nan_iata_df)


In [139]:
print(cleaned_df.head())

               teams                 date game result    opponent
0  arizona cardinals  2023-09-10 00:00:00        away  commanders
1  arizona cardinals  2023-09-17 00:00:00        home      giants
2  arizona cardinals  2023-09-24 00:00:00        home     cowboys
3  arizona cardinals  2023-10-01 00:00:00        away       49ers
4  arizona cardinals  2023-10-08 00:00:00        home     bengals


In [149]:


# Function to check for match and create opponent_team column
def find_opponent_team(row):
    opponent_prefix = row['opponent'][:4]
    for team in cleaned_df['teams']:
        if opponent_prefix in team:
            return team
    return None

# Apply the function to create the opponent_team column
cleaned_df['opponent_team'] = cleaned_df.apply(find_opponent_team, axis=1)

# Display the updated DataFrame
print(cleaned_df)

                     teams                 date game result    opponent  \
0        arizona cardinals  2023-09-10 00:00:00        away  commanders   
1        arizona cardinals  2023-09-17 00:00:00        home      giants   
2        arizona cardinals  2023-09-24 00:00:00        home     cowboys   
3        arizona cardinals  2023-10-01 00:00:00        away       49ers   
4        arizona cardinals  2023-10-08 00:00:00        home     bengals   
..                     ...                  ...         ...         ...   
571  washington commanders  2023-12-10 00:00:00         bye         bye   
572  washington commanders  2023-12-17 00:00:00        away        rams   
573  washington commanders  2023-12-24 00:00:00        away        jets   
574  washington commanders  2023-12-31 00:00:00        home       49ers   
575  washington commanders          jan. 6 or 7        home     cowboys   

             opponent_team  
0    washington commanders  
1          new york giants  
2           

In [151]:
# Filter the DataFrame to show only rows where 'IATA Code' is NaN
nan_iata_df = cleaned_df[cleaned_df['IATA Code'].isna()]

# Display the resulting DataFrame
print(nan_iata_df)

KeyError: 'IATA Code'

In [None]:
# Update the IATA Code assignment function
def get_iata(row):
    if row['Location'] == 'home':
        return iata_mapping.get(row['Team'], None)
    elif row['Location'] == 'away':
        # Clean opponent name if necessary
        opponent = clean_opponent_name(row['Opponent'])
        return iata_mapping.get(opponent, None)  # Handle if opponent not found
    return None

cleaned_df['iata_code'] = cleaned_df.apply(get_iata, axis=1)

In [None]:
# Replace the 'Opponent' column with the 'opponent_team' column
cleaned_df['Opponent'] = cleaned_df['opponent_team']

# Optionally, drop the 'opponent_team' column if you don't need it anymore
cleaned_df.drop(columns=['opponent_team'], inplace=True)

# Display the updated DataFrame
print(cleaned_df.columns)

In [None]:
cleaned_df.columns


In [None]:
print(cleaned_df.head())

In [None]:
# Mapping of teams to their IATA codes
iata_codes = {
    'arizona cardinals': 'PHX',
    'atlanta falcons': 'ATL',
    'baltimore ravens': 'BWI',
    'buffalo bills': 'BUF',
    'carolina panthers': 'CLT',
    'chicago bears': 'ORD',
    'cincinnati bengals': 'CVG',
    'cleveland browns': 'CLE',
    'dallas cowboys': 'DFW',
    'denver broncos': 'DEN',
    'detroit lions': 'DTW',
    'green bay packers': 'GRB',
    'houston texans': 'IAH',
    'indianapolis colts': 'IND',
    'jacksonville jaguars': 'JAX',
    'kansas city chiefs': 'MCI',
    'las vegas raiders': 'LAS',
    'los angeles chargers': 'LAX',
    'los angeles rams': 'LAX',
    'miami dolphins': 'MIA',
    'minnesota vikings': 'MSP',
    'new england patriots': 'BOS',
    'new orleans saints': 'MSY',
    'new york giants': 'JFK',
    'new york jets': 'JFK',
    'philadelphia eagles': 'PHL',
    'pittsburgh steelers': 'PIT',
    'san francisco 49ers': 'SFO',
    'seattle seahawks': 'SEA',
    'tampa bay buccaneers': 'TPA',
    'tennessee titans': 'BNA',
    'washington commanders': 'DCA'
}

# Function to determine IATA code based on location
def get_iata_code(row):
    if row['Location'] == 'home':
        return iata_codes[row['Team']]
    elif row['Location'] == 'away':
        opponent = row['Opponent'].lower()
        return iata_codes.get(opponent, None)  # Use .get to avoid KeyError

# Apply the function to create the IATA Code column
cleaned_df['IATA Code'] = cleaned_df.apply(get_iata_code, axis=1)

# Display the updated DataFrame
print(cleaned_df)

In [None]:
# Check for NaN values in critical columns like 'Opponent' and 'IATA Code'.

In [None]:
# Ensure all team names in the 'Team' and 'Opponent' columns are consistently formatted (lowercase, no extra spaces).
# Convert the 'Date' column to a datetime format for easier manipulation and analysis.
# Game Week: Create a column to represent the week of the season for each game (e.g., Week 1, Week 2).


# Standardize string columns
for col in ['Team', 'Opponent', 'Location']:
    cleaned_df[col] = cleaned_df[col].str.lower().str.strip()


In [None]:
# Clean the 'Date' column: replace problematic formats
cleaned_df['Date'] = cleaned_df['Date'].str.replace('Jan. 6 or 7', '2023-01-06', regex=False)

# Convert Date to datetime format with error handling
cleaned_df['Date'] = pd.to_datetime(cleaned_df['Date'], errors='coerce')

print(cleaned_df)


In [None]:
print(cleaned_df.head())

In [None]:
# Identify rows where the Date conversion resulted in NaT
invalid_date_rows = cleaned_df[cleaned_df['Date'].isna()]

# Display rows with invalid dates
print(invalid_date_rows)

In [None]:

# Create a Game Week column
cleaned_df['Game Week'] = cleaned_df['Date'].dt.isocalendar().week

# Final inspection


In [None]:
print(cleaned_df['Date'].unique())

In [None]:
# Filter the DataFrame to find the row with the problematic date
problematic_game = cleaned_df[cleaned_df['Date'].astype(str).str.contains('Jan. 6 or 7')]
print(problematic_game)

In [None]:
# jan 7 games
teams_to_update = ['buffalo bills', 'miami dolphins', 'kansas city chiefs',
                   'los angeles chargers', 'los angeles rams', 'san francisco 49ers',
                   'seattle seahawks', 'arizona cardinals', 'philadelphia eagles',
                   'new york giants', 'denver broncos', 'las vegas raiders',
                   'dallas cowboys', 'washington commanders', 'chicago bears',
                   'green bay packers', 'cleveland browns', 'cincinnati bengals',
                   'tampa bay buccaneers', 'carolina panthers', 'new york jets',
                   'new england patriots', 'minnesota vikings', 'detroit lions',
                   'jacksonville jaguars']                   

# Update the Date column for the specified teams
cleaned_df.loc[
    (cleaned_df['Team'].isin(teams_to_update)) & 
    (cleaned_df['Date'].dt.strftime('%b. %d').isin(['Jan. 6', 'Jan. 7'])),
    'Date'
] = '2024-01-07'

# Convert the Date column to datetime format
cleaned_df['Date'] = pd.to_datetime(cleaned_df['Date'], errors='coerce')

# Final inspection
print(cleaned_df[cleaned_df['Date'].isna()])  # Check for any remaining NaT values
print(cleaned_df.head()) 

In [None]:
# jan 6 games
teams_to_update = [
    'pittsburgh steelers',
    'baltimore ravens',
    'indianapolis colts',
    'houston texans'
]

# Update the Date column for the specified teams
cleaned_df.loc[cleaned_df['Team'].isin(teams_to_update), 'Date'] = '2024-01-06'

# Convert the Date column to datetime format
cleaned_df['Date'] = pd.to_datetime(cleaned_df['Date'], errors='coerce')

# Final inspection
print(cleaned_df[cleaned_df['Date'].isna()])  # Check for any remaining NaT values
print(cleaned_df.head())  # View updated DataFrame

In [None]:
# Get unique values in the 'Date' column
unique_dates = cleaned_df['Date'].unique()

# Print the unique dates
print(unique_dates)

In [None]:
# Filter for rows where the Date column is NaT
nat_rows = cleaned_df[cleaned_df['Date'].isna()]

# Display the rows with NaT in the Date column
print(nat_rows)

In [None]:
# Create a dictionary to map teams to their game dates
saints_game_dates = {
    'tennessee titans': '2023-09-10',
    'carolina panthers': '2023-09-17',
    'green bay packers': '2023-09-24',
    'tampa bay buccaneers': '2023-10-01',
    'new england patriots': '2023-10-08',
    'houston texans': '2023-10-15',
    'jacksonville jaguars': '2023-10-22',
    'indianapolis colts': '2023-10-29',
    'chicago bears': '2023-11-05',
    'minnesota vikings': '2023-11-12',
    'atlanta falcons': '2023-11-19',
    'tampa bay buccaneers': '2023-11-26',
    'los angeles rams': '2023-12-03',
    'new york giants': '2023-12-10',
    'atlanta falcons': '2023-12-17',
    'philadelphia eagles': '2023-12-24',
    'miami dolphins': '2023-12-31'

}

# Update the Date column based on the Team column
for team, date in saints_game_dates.items():
    cleaned_df.loc[cleaned_df['Opponent'] == team, 'Date'] = pd.to_datetime(date)

# Final inspection to see if updates were successful
print(cleaned_df[cleaned_df['Team'] == 'new orleans saints'])


In [None]:
# List of Tennessee Titans games and their dates
titans_games = {
    'new orleans saints': '2023-09-10',
    'los angeles chargers': '2023-09-17',
    'cleveland browns': '2023-09-24',
    'indianapolis colts': '2023-10-01',
    'baltimore ravens': '2023-10-08',
    'atlanta falcons': '2023-10-15',
    'cincinnati bengals': '2023-10-22',
    'jacksonville jaguars': '2023-10-29',
    'tampa bay buccaneers': '2023-11-05',
    'carolina panthers': '2023-11-12',
    'miami dolphins': '2023-11-19',
    'houston texans': '2023-11-26',
    'indianapolis colts': '2023-12-03',
    'new york jets': '2023-12-10',
    'houston texans': '2023-12-17',
    'seattle seahawks': '2023-12-24',
    'jacksonville jaguars': '2023-12-31',
    'new england patriots': '2024-01-07'
}

# Update the 'Date' column in cleaned_df
for opponent, date in titans_games.items():
    cleaned_df.loc[cleaned_df['Opponent'] == opponent, 'Date'] = pd.to_datetime(date)

# Verify the changes
print(cleaned_df[cleaned_df['Team'] == 'tennessee titans'])

In [None]:
cleaned_df.drop(columns=['iata_code'], inplace=True)

In [None]:
duplicates = cleaned_df.duplicated()
print("Duplicate rows:")
print(cleaned_df[duplicates])

In [None]:
#drop duplicates
cleaned_df = cleaned_df.drop_duplicates()


In [None]:
null_values = cleaned_df.isnull().sum()
print("Null values in each column:")
print(null_values)

In [None]:
# Display rows where 'Date' is null
null_date_rows = cleaned_df[cleaned_df['Date'].isnull()]
print("Rows with null values in the Date column:")
print(null_date_rows)

In [None]:
# Display rows where 'Date' is null
null_date_rows = cleaned_df[cleaned_df['Date'].isnull()]
print("Rows with null values in the Date column:")
print(null_date_rows)

In [None]:
# handle bye weeks for Opponent
cleaned_df.loc[cleaned_df['Opponent'].isnull() & cleaned_df['Location'] == 'bye', 'Opponent'] = 'bye'

In [None]:
# Check the updated DataFrame
print("Updated DataFrame:")
print(cleaned_df[cleaned_df['Opponent'].isnull() | cleaned_df['Date'].isnull()])

In [None]:
# Display rows where the Team is 'tennessee titans'
titans_games = cleaned_df[cleaned_df['Team'] == 'tennessee titans']
print(titans_games)

In [None]:
# Remove rows where 'Opponent' is 'none' or 'Date' is NaT
cleaned_df = cleaned_df[~((cleaned_df['Opponent'] == 'none') | (cleaned_df['Date'].isna()))]

print(cleaned_df)

In [None]:
# Check for duplicates
duplicates = cleaned_df.duplicated()
duplicate_count = duplicates.sum()

# Check for missing values
missing_values = cleaned_df.isna().sum()

print("Duplicate Rows:")
print(cleaned_df[duplicates])

print("\nTotal Duplicates:", duplicate_count)

print("\nMissing Values in Each Column:")
print(missing_values)

In [None]:
# Remove rows where Location is 'bye'
cleaned_df = cleaned_df[cleaned_df['Location'] != 'bye']

# Check the shape of the DataFrame after the operation
print(cleaned_df.shape)

In [None]:
# Check for missing values
missing_values = cleaned_df.isna().sum()

print("\nMissing Values in Each Column:")
print(missing_values)

In [None]:
missing_values_df = cleaned_df[cleaned_df.isnull().any(axis=1)]
print(missing_values_df)

In [None]:
# Filter the DataFrame
cleaned_df = cleaned_df[~(cleaned_df['Location'].isin(['None', 'bye']) | cleaned_df['Date'].isna())]

# Final inspection
print(cleaned_df.head())

In [None]:
# Check for missing values
missing_values = cleaned_df.isna().sum()

print("\nMissing Values in Each Column:")
print(missing_values)

In [None]:
# Filter the DataFrame for rows where Location, Opponent, and IATA Code are all missing
missing_rows = cleaned_df[cleaned_df[['Location', 'Opponent', 'IATA Code']].isna().all(axis=1)]

# Display the rows
print(missing_rows)


In [None]:
# Define the indices of the rows to drop
indices_to_drop = [55, 352, 480]

# Drop the specified rows from the DataFrame
cleaned_df = cleaned_df.drop(index=indices_to_drop)

# Resetting the index (optional)
cleaned_df.reset_index(drop=True, inplace=True)

# Display the updated DataFrame
print(cleaned_df)

In [None]:
# Check for missing values
missing_values = cleaned_df.isna().sum()

print("\nMissing Values in Each Column:")
print(missing_values)

In [None]:
# Summary statistics
print(cleaned_df.describe())

In [None]:
# Specify the path where you want to save the file
file_path = r'C:\Users\hopeh\Desktop\data_science_bootcamp\flight_times_capstone\nfl_iata.csv'

# Save the DataFrame as a CSV file
cleaned_df.to_csv(file_path, index=False)

In [163]:
# Load .csv file to review
nfl_2023 = pd.read_csv(r"C:\Users\hopeh\Desktop\data_science_bootcamp\flight_times_capstone\nfl_iata.csv")

In [155]:
nfl_2023.head()

Unnamed: 0,Team,Date,Location,Opponent,IATA Code,Game Week
0,arizona cardinals,2023-09-10,away,washington commanders,DCA,36.0
1,arizona cardinals,2023-12-10,home,new york giants,PHX,37.0
2,arizona cardinals,2023-09-24,home,dallas cowboys,PHX,38.0
3,arizona cardinals,2023-10-01,away,san francisco 49ers,SFO,39.0
4,arizona cardinals,2023-10-22,home,cincinnati bengals,PHX,40.0


In [157]:
# Convert column names to lowercase
nfl_2023.columns = nfl_2023.columns.str.lower()

# Convert all string values in the DataFrame to lowercase
nflnfl_2023_df = nfl_2023.applymap(lambda x: x.lower() if isinstance(x, str) else x)

# Check the updated DataFrame
print(nfl_2023.head())


                team        date location               opponent iata code  \
0  arizona cardinals  2023-09-10     away  washington commanders       DCA   
1  arizona cardinals  2023-12-10     home        new york giants       PHX   
2  arizona cardinals  2023-09-24     home         dallas cowboys       PHX   
3  arizona cardinals  2023-10-01     away    san francisco 49ers       SFO   
4  arizona cardinals  2023-10-22     home     cincinnati bengals       PHX   

   game week  
0       36.0  
1       37.0  
2       38.0  
3       39.0  
4       40.0  


  nflnfl_2023_df = nfl_2023.applymap(lambda x: x.lower() if isinstance(x, str) else x)


In [161]:
# Convert column names to lowercase
nfl_2023.columns = nfl_2023.columns.str.lower()

# Convert all string values in the DataFrame to lowercase
nfl_2023 = nfl_2023.applymap(lambda x: x.lower() if isinstance(x, str) else x)

# Check the updated DataFrame
print(nfl_2023.head())


                team        date location               opponent iata code  \
0  arizona cardinals  2023-09-10     away  washington commanders       dca   
1  arizona cardinals  2023-12-10     home        new york giants       phx   
2  arizona cardinals  2023-09-24     home         dallas cowboys       phx   
3  arizona cardinals  2023-10-01     away    san francisco 49ers       sfo   
4  arizona cardinals  2023-10-22     home     cincinnati bengals       phx   

   game week  
0       36.0  
1       37.0  
2       38.0  
3       39.0  
4       40.0  


  nfl_2023 = nfl_2023.applymap(lambda x: x.lower() if isinstance(x, str) else x)
