# Runners And Income Data Analysis

In [1]:
# Import necessary libraries

from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the runners/income dataset into a DataFrame.
# The path is relative, so it is resolved against the current working directory.
csv_path = "../DATA/runners_and_income_data.csv"
df = pd.read_csv(csv_path)

# Preview the first two records
df.head(2)

Unnamed: 0,Name,Miles Run,Time in hrs,Age,Income,Gender
0,Joe,45.0,40.0,23.0,50000.0,male
1,Phil,,38.0,44.0,,male


In [3]:
# Report the dimensions of the DataFrame (rows first, then columns)
n_rows, n_cols = df.shape
print(f"The number of rows is {n_rows}")
print(f"The number of columns is {n_cols}")

The number of rows is 23
The number of columns is 6


In [4]:
# Count missing (NaN) entries across the entire DataFrame:
# build the boolean NaN mask, then sum every cell of it at once.
nan_count = df.isna().to_numpy().sum()
print(f"The number of NAN values is {nan_count}")

The number of NAN values is 50


In [5]:
# Drop only the rows in which every column is NA (how='all');
# rows that are missing just some of their values are kept.
# The result is reassigned rather than mutating the frame with inplace=True.
df = df.dropna(axis=0, how='all')
df.head()

Unnamed: 0,Name,Miles Run,Time in hrs,Age,Income,Gender
0,Joe,45.0,40.0,23.0,50000.0,male
1,Phil,,38.0,44.0,,male
2,Ken,63.0,,56.0,60000.0,female
3,Jos,36.0,50.0,,75000.0,male
4,Luke,43.0,50.0,34.0,,female


In [6]:
# Replace the remaining NA values with 0.00.
# NOTE: a scalar fillna applies to every column, and the zero-filled entries
# are treated as real observations by any statistic computed afterwards
# (e.g. a column mean), so those results are pulled toward zero.
df = df.fillna(value=0.00)

# Confirm that no NA values are left (expected: 0)
df.isna().sum().sum()

0

In [7]:
# Average of the 'Miles Run' column, rounded to two decimals.
# Taken directly from Series.mean() rather than read off the describe() table.
mean_miles_run = np.round(df['Miles Run'].mean(), 2)
print(f"The mean of the miles run column is {mean_miles_run}")

The mean of the miles run column is 29.47


In [8]:
# Alternative route to the same figure: build the frame-wide describe()
# summary, then pick the 'mean' row of the 'Miles Run' column.
summary_stats = df.describe()
mean_miles_run = summary_stats.loc['mean', 'Miles Run'].round(2)
print(f"The mean of the miles run column is {mean_miles_run}")


The mean of the miles run column is 29.47


In [9]:
# Sum of all distances in the 'Miles Run' column, printed with two decimals
total_miles_run = df['Miles Run'].sum()
print(f"The total miles run is {total_miles_run:.2f}")

The total miles run is 501.00


In [10]:
# Names of everyone whose income is exactly $50000
earns_50k = df['Income'].eq(50000)
names = df.loc[earns_50k, 'Name'].tolist()
print(names)

['Joe', 'Teddy', 'Ira']


In [11]:
# Take the last two rows of the dataset and renumber them starting from 0.
# reset_index is chained (instead of inplace=True) so a new frame is returned
# and the object produced by tail() is never mutated in place.
df_last2_rows = df.tail(2).reset_index(drop=True)
df_last2_rows

Unnamed: 0,Name,Miles Run,Time in hrs,Age,Income,Gender
0,Barack,35.0,40.0,62.0,33000.0,male
1,Vladamir,44.0,38.0,64.0,53000.0,male


In [12]:
# Save the two-row subset to a CSV file, without writing the index column.
output_path = Path("../DATA/data/runners_data_modified.csv")
# to_csv does not create missing folders, so make sure the target folder exists
output_path.parent.mkdir(parents=True, exist_ok=True)
df_last2_rows.to_csv(output_path, index=False)