# Data Science with Python

## Importing Modules

In [79]:
import pandas as pd
print(pd.__version__)

1.5.3


## Creating an Empty DataFrame

In [80]:
dframe = pd.DataFrame()
print(dframe)

Empty DataFrame
Columns: []
Index: []


## Creating and Storing Data in a DataFrame

In [81]:
df = pd.DataFrame(
    {
        'Names' : ['C#', 'Java', 'Bootstrap', 'Kotlin'],
        'Years' : [2000, 1995, 2011, 2011]
    }
)
# Accessing the second entry in the 'Names' series
df['Names'][1]

'Java'

## Creating a Series from Scratch

In [82]:
ct = pd.Series([6489,'Hey Ho',2169,True,3.15], name='count')
ct

0      6489
1    Hey Ho
2      2169
3      True
4      3.15
Name: count, dtype: object

## Creating DataFrame from Repository

In [83]:
import requests, io

url = 'https://github.com/iraqooh/JDBC-powered-ATM-Application/blob/main/data.txt'
response = requests.get(url)
if response.status_code == 200:
    data = response.text
    dframe = pd.DataFrame(io.StringIO(data))
    print(dframe.head())
else:
    print("Failed to retrieve the dataset. Status code:",
          response.status_code)

    0
0  \n
1  \n
2  \n
3  \n
4  \n


## Exercise 1

In [84]:
data = {
    'Sunday' : ['5th', '12th', '19th', '26th'],
    'Monday' : ['6th', '13th', '20th', '27th'],
    'Tuesday' : ['7th', '14th', '21st', '28th'],
    'Wednesday' : ['8th', '15th', '22nd', '29th'],
    'Thursday' : ['9th', '16th', '23rd', '30th'],
    'Friday' : ['10th', '17th', '24th', '31st'],
    'Saturday' : ['11th', '18th', '25th', ''],
}

dframe = pd.DataFrame(data)
dframe
dframe.to_csv('Exercise Python/calendar.csv', index=False)

## Exercise 2

In [85]:
subset = dframe[['Monday', 'Tuesday']]
subset

Unnamed: 0,Monday,Tuesday
0,6th,7th
1,13th,14th
2,20th,21st
3,27th,28th


## Assignment a)

In [86]:
# Reading the file
path = 'Exercise Python/'
df = pd.read_csv(path + 'mine.csv')

# Getting top 5 entries
df.head()

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60,110,130,409.1
1,60,117,145,479.0
2,60,103,135,340.0
3,45,109,175,282.4
4,45,117,148,406.0


In [87]:
# Getting last 5 entries
df.tail()

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
164,60,105,140,290.8
165,60,110,145,300.0
166,60,115,145,310.2
167,75,120,150,320.4
168,75,125,150,330.4


In [88]:
# Getting technical information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169 entries, 0 to 168
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Duration  169 non-null    int64  
 1   Pulse     169 non-null    int64  
 2   Maxpulse  169 non-null    int64  
 3   Calories  164 non-null    float64
dtypes: float64(1), int64(3)
memory usage: 5.4 KB


In [89]:
# Calculating number of null values
df.isnull().sum()

Duration    0
Pulse       0
Maxpulse    0
Calories    5
dtype: int64

In [90]:
# Removing entries with null values and returning a new copy of the dataframe
df_filtered = df.dropna(inplace=False)

# Checking for null values
df_filtered.isnull().sum()

Duration    0
Pulse       0
Maxpulse    0
Calories    0
dtype: int64

In [91]:
# Getting average or mean values for the calories series
avg_calories = df.describe()['Calories']['mean']
# or
# avg_caloriee = df['Calories'].mean()
avg_calories

375.79024390243904

In [92]:
# Replacing null values in the 'Calories' column with the average
df['Calories'].fillna(avg_calories, inplace=True)

# Checking for null values
df.isnull().sum()

Duration    0
Pulse       0
Maxpulse    0
Calories    0
dtype: int64

In [93]:
# Getting data type of each series
df.dtypes

Duration      int64
Pulse         int64
Maxpulse      int64
Calories    float64
dtype: object

In [94]:
# Checking for duplicates in the data
df.duplicated().sum()

7

In [95]:
# Removing the duplicates while keeping only the first occurrence
# Within the same dataframe
df.drop_duplicates(inplace=True)

# Checking for duplicates after removing them
df.duplicated().sum()

0

In [96]:
# Creating a correlation matrix 
corr_matrix = df.corr()
corr_matrix

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
Duration,1.0,-0.162098,0.003578,0.921907
Pulse,-0.162098,1.0,0.787035,0.015408
Maxpulse,0.003578,0.787035,1.0,0.194031
Calories,0.921907,0.015408,0.194031,1.0


In [97]:
# Writing cleaned data to file
df.to_csv(path + 'mine_clean.csv', index=False)

# Displaying cleaned data
df

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60,110,130,409.1
1,60,117,145,479.0
2,60,103,135,340.0
3,45,109,175,282.4
4,45,117,148,406.0
...,...,...,...,...
164,60,105,140,290.8
165,60,110,145,300.0
166,60,115,145,310.2
167,75,120,150,320.4


## Assignment b)

In [98]:
# Reading the file
df = pd.read_csv(path + 'Work.csv')

# Getting top 5 entries
df.head()

Unnamed: 0,Name,city,age,py-score
0,Emma,Kampala,23.0,90
1,Wilber,Mbale,26.0,75
2,Robin,Gulu,25.0,mine
3,Tevor,Livingstone,,89
4,Yeko,Tororo,20.0,94


In [99]:
# Getting the shape of the data
df.shape

(7, 4)

In [100]:
# Getting last 5 entries
df.tail()

Unnamed: 0,Name,city,age,py-score
2,Robin,Gulu,25,mine
3,Tevor,Livingstone,,89
4,Yeko,Tororo,20,94
5,Miriam,Arua,Train,
6,Jesca,Mbarara,21,84


In [101]:
# Getting technical information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Name      7 non-null      object
 1   city      7 non-null      object
 2   age       6 non-null      object
 3   py-score  6 non-null      object
dtypes: object(4)
memory usage: 352.0+ bytes


In [102]:
# Calculating number of null values
df.isnull().sum()

Name        0
city        0
age         1
py-score    1
dtype: int64

In [103]:
# Removing entries with null values and returning a new copy of the dataframe
df_filtered = df.dropna(inplace=False)

# Checking for null values
df_filtered.isnull().sum()

Name        0
city        0
age         0
py-score    0
dtype: int64

In [104]:
# Since the dataset has only 7 entries, 
# I wouldn't remove those with null values in some columns
# Instead, I will replace them with a default value
# First, ensure all values in the 'age' column are integers
df['age'] = pd.to_numeric(df['age'], errors='coerce')

# Checking the new data type of the 'age' column
df.dtypes

Name         object
city         object
age         float64
py-score     object
dtype: object

In [105]:
# Getting the average age
avg_age = df['age'].describe()['mean']
avg_age

23.0

In [106]:
# Getting number of null values in the 'age' column
df['age'].isna().sum()

2

In [107]:
# Replacing the null values with the average value in the 'age' column
df['age'] = df['age'].fillna(avg_age)

# Checking for null values in the edited dataframe
df.isna().sum()

Name        0
city        0
age         0
py-score    1
dtype: int64

In [108]:
# Ensuring all values in the 'py-score' column are integers
df['py-score'] = pd.to_numeric(df['py-score'], errors='coerce')

# Checking the new data type of the 'age' column
df.dtypes

Name         object
city         object
age         float64
py-score    float64
dtype: object

In [109]:
# Getting the average age
avg_py_score = df['py-score'].describe()['mean']
avg_py_score

86.4

In [110]:
# Checking for null values in the 'py-score' column
df['py-score'].isna().sum()

2

In [111]:
# Replacing the null values with the average value in the 'py-score' column
df['py-score'] = df['py-score'].fillna(avg_age)

# Checking for null values in the edited dataframe
df.isna().sum()

Name        0
city        0
age         0
py-score    0
dtype: int64

In [112]:
# Checking for duplicates in the data
df.duplicated().sum()

0

In [113]:
# Creating a correlation matrix between the 'age' and 'py-score' columns
corr_matrix = df[['age', 'py-score']].corr()
corr_matrix

Unnamed: 0,age,py-score
age,1.0,-0.454894
py-score,-0.454894,1.0


In [114]:
# Writing cleaned data to file
df.to_csv(path + 'Work_clean.csv', index=False)

# Displaying cleaned data
df

Unnamed: 0,Name,city,age,py-score
0,Emma,Kampala,23.0,90.0
1,Wilber,Mbale,26.0,75.0
2,Robin,Gulu,25.0,23.0
3,Tevor,Livingstone,23.0,89.0
4,Yeko,Tororo,20.0,94.0
5,Miriam,Arua,23.0,23.0
6,Jesca,Mbarara,21.0,84.0
