# Pandas methods testing

### Creating, loading, and saving of data

In [472]:
# Sample records for four people, stored column-wise (one list per column)
people = dict(
    first=['ginger', 'baddie', 'mickey', 'meggy'],
    last=['bread', 'baldie', 'mann', 'muffin'],
    email=[
        'gingerbread@email.com',
        'baddiebaldie@email.com',
        'mickeymann@email.com',
        'meggymuffin@email.com',
    ],
    sex=['male', 'female', 'male', 'female'],
    age=[18, 27, 33, 15],
)

In [473]:
# Importing the libraries and building the dataframe from the dictionary
import pandas as pd
import numpy as np

people_dataframe = pd.DataFrame(people)
people_dataframe

Unnamed: 0,first,last,email,sex,age
0,ginger,bread,gingerbread@email.com,male,18
1,baddie,baldie,baddiebaldie@email.com,female,27
2,mickey,mann,mickeymann@email.com,male,33
3,meggy,muffin,meggymuffin@email.com,female,15


In [474]:
# Saving to csv (with the pandas defaults the row index is written as the first column,
# which is why the file is read back with index_col=0)
people_dataframe.to_csv('data/people_dataframe.csv')

In [475]:
# Loading from csv file.
# The path uses forward slashes, exactly like the path given to to_csv, so the
# cell reads the file that was just written on any operating system (a
# backslash-separated path only resolves on Windows).
# index_col=0 turns the first csv column back into the row index.
people_dataframe = pd.read_csv('data/people_dataframe.csv', index_col=0)
people_dataframe

Unnamed: 0,first,last,email,sex,age
0,ginger,bread,gingerbread@email.com,male,18
1,baddie,baldie,baddiebaldie@email.com,female,27
2,mickey,mann,mickeymann@email.com,male,33
3,meggy,muffin,meggymuffin@email.com,female,15


### Displaying of dataframe and information about it

In [476]:
# Displaying first 5 rows of dataframe
people_dataframe.head(5)

Unnamed: 0,first,last,email,sex,age
0,ginger,bread,gingerbread@email.com,male,18
1,baddie,baldie,baddiebaldie@email.com,female,27
2,mickey,mann,mickeymann@email.com,male,33
3,meggy,muffin,meggymuffin@email.com,female,15


In [477]:
# Setting display option of dataframe
pd.set_option('display.max_columns', 20)

In [478]:
# Showing the number of rows and columns as a (rows, columns) tuple
people_dataframe.shape

(4, 5)

In [479]:
# Showing the columns' information
people_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4 entries, 0 to 3
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   first   4 non-null      object
 1   last    4 non-null      object
 2   email   4 non-null      object
 3   sex     4 non-null      object
 4   age     4 non-null      int64 
dtypes: int64(1), object(4)
memory usage: 192.0+ bytes


In [480]:
# Statistics about the dataframe
people_dataframe.describe()

Unnamed: 0,age
count,4.0
mean,23.25
std,8.261356
min,15.0
25%,17.25
50%,22.5
75%,28.5
max,33.0


### Selecting rows and columns

In [481]:
# Selecting columns
people_dataframe['email']

0     gingerbread@email.com
1    baddiebaldie@email.com
2      mickeymann@email.com
3     meggymuffin@email.com
Name: email, dtype: object

In [482]:
# Selecting multiple columns
people_dataframe[['first', 'last', 'email']]

Unnamed: 0,first,last,email
0,ginger,bread,gingerbread@email.com
1,baddie,baldie,baddiebaldie@email.com
2,mickey,mann,mickeymann@email.com
3,meggy,muffin,meggymuffin@email.com


In [483]:
# Selecting a single row by integer position (iloc is position-based, loc is label-based)
people_dataframe.iloc[0]

first                   ginger
last                     bread
email    gingerbread@email.com
sex                       male
age                         18
Name: 0, dtype: object

In [484]:
# Selecting multiple rows by position, restricted to the column at position 2 ('email')
people_dataframe.iloc[[0,1], 2]

0     gingerbread@email.com
1    baddiebaldie@email.com
Name: email, dtype: object

In [485]:
# Selecting multiple rows with selected columns by index
people_dataframe.iloc[[0,1], [2,3]]

Unnamed: 0,email,sex
0,gingerbread@email.com,male
1,baddiebaldie@email.com,female


In [486]:
# Selecting multiple rows with selected columns by label
people_dataframe.loc[[0,1], ['email', 'sex']]

Unnamed: 0,email,sex
0,gingerbread@email.com,male
1,baddiebaldie@email.com,female


In [487]:
# Getting single value
people_dataframe.at[0, 'first']

'ginger'

### Filtering the data frame

In [488]:
# Searching through the data frame with given conditions
filt = (people_dataframe['age'] <= 20)
people_dataframe.loc[filt]

Unnamed: 0,first,last,email,sex,age
0,ginger,bread,gingerbread@email.com,male,18
3,meggy,muffin,meggymuffin@email.com,female,15


In [489]:
# Searching through the data frame by given conditions with selected columns
people_dataframe.loc[filt, ['first', 'last', 'age']]

Unnamed: 0,first,last,age
0,ginger,bread,18
3,meggy,muffin,15


### Set, reset, sort, and use index

In [490]:
# Showing index
people_dataframe.index

Int64Index([0, 1, 2, 3], dtype='int64')

In [491]:
# Setting index
people_dataframe.set_index('email', inplace=True)

In [492]:
# Dataframe after index set
people_dataframe

Unnamed: 0_level_0,first,last,sex,age
email,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
gingerbread@email.com,ginger,bread,male,18
baddiebaldie@email.com,baddie,baldie,female,27
mickeymann@email.com,mickey,mann,male,33
meggymuffin@email.com,meggy,muffin,female,15


In [493]:
# Selecting a row by its index label (the email address, now that 'email' is the index)
people_dataframe.loc['mickeymann@email.com']

first    mickey
last       mann
sex        male
age          33
Name: mickeymann@email.com, dtype: object

In [494]:
# Resetting index
people_dataframe.reset_index(inplace=True)

In [495]:
# Dataframe after index reset
people_dataframe

Unnamed: 0,email,first,last,sex,age
0,gingerbread@email.com,ginger,bread,male,18
1,baddiebaldie@email.com,baddie,baldie,female,27
2,mickeymann@email.com,mickey,mann,male,33
3,meggymuffin@email.com,meggy,muffin,female,15


In [496]:
# Sorting index
people_dataframe.sort_index(ascending=False)

Unnamed: 0,email,first,last,sex,age
3,meggymuffin@email.com,meggy,muffin,female,15
2,mickeymann@email.com,mickey,mann,male,33
1,baddiebaldie@email.com,baddie,baldie,female,27
0,gingerbread@email.com,ginger,bread,male,18


### Modifying data within dataframe

In [497]:
# Renaming columns
people_dataframe.columns = ['email', 'first', 'last', 'sex', 'age']
people_dataframe

Unnamed: 0,email,first,last,sex,age
0,gingerbread@email.com,ginger,bread,male,18
1,baddiebaldie@email.com,baddie,baldie,female,27
2,mickeymann@email.com,mickey,mann,male,33
3,meggymuffin@email.com,meggy,muffin,female,15


In [498]:
# Renaming columns using mapping
people_dataframe.rename(columns={'first' : 'first name', 'last' : 'last name'}, inplace=True)
people_dataframe

Unnamed: 0,email,first name,last name,sex,age
0,gingerbread@email.com,ginger,bread,male,18
1,baddiebaldie@email.com,baddie,baldie,female,27
2,mickeymann@email.com,mickey,mann,male,33
3,meggymuffin@email.com,meggy,muffin,female,15


In [499]:
# Modifying columns with string manipulation
people_dataframe.columns = people_dataframe.columns.str.replace(' ', '_')
people_dataframe

Unnamed: 0,email,first_name,last_name,sex,age
0,gingerbread@email.com,ginger,bread,male,18
1,baddiebaldie@email.com,baddie,baldie,female,27
2,mickeymann@email.com,mickey,mann,male,33
3,meggymuffin@email.com,meggy,muffin,female,15


In [500]:
# Modifying columns with list comprehension
people_dataframe.columns = [x.upper() for x in people_dataframe.columns]
people_dataframe

Unnamed: 0,EMAIL,FIRST_NAME,LAST_NAME,SEX,AGE
0,gingerbread@email.com,ginger,bread,male,18
1,baddiebaldie@email.com,baddie,baldie,female,27
2,mickeymann@email.com,mickey,mann,male,33
3,meggymuffin@email.com,meggy,muffin,female,15


In [501]:
# Modifying columns with string manipulation
people_dataframe.columns = people_dataframe.columns.str.lower()
people_dataframe

Unnamed: 0,email,first_name,last_name,sex,age
0,gingerbread@email.com,ginger,bread,male,18
1,baddiebaldie@email.com,baddie,baldie,female,27
2,mickeymann@email.com,mickey,mann,male,33
3,meggymuffin@email.com,meggy,muffin,female,15


In [502]:
# Modifying row values by index
people_dataframe.loc[0] = ['breadginger@email.com', 'bread', 'ginger', 'female', 81]
people_dataframe

Unnamed: 0,email,first_name,last_name,sex,age
0,breadginger@email.com,bread,ginger,female,81
1,baddiebaldie@email.com,baddie,baldie,female,27
2,mickeymann@email.com,mickey,mann,male,33
3,meggymuffin@email.com,meggy,muffin,female,15


In [503]:
# Modifying row values by index with selected columns
people_dataframe.loc[0, ['first_name', 'last_name']] = ['ginger', 'bread']
people_dataframe

Unnamed: 0,email,first_name,last_name,sex,age
0,breadginger@email.com,ginger,bread,female,81
1,baddiebaldie@email.com,baddie,baldie,female,27
2,mickeymann@email.com,mickey,mann,male,33
3,meggymuffin@email.com,meggy,muffin,female,15


In [504]:
# Modifying a single value addressed by row label and column name
people_dataframe.loc[0, 'age'] = 18
people_dataframe

Unnamed: 0,email,first_name,last_name,sex,age
0,breadginger@email.com,ginger,bread,female,18
1,baddiebaldie@email.com,baddie,baldie,female,27
2,mickeymann@email.com,mickey,mann,male,33
3,meggymuffin@email.com,meggy,muffin,female,15


In [505]:
# Modifying row values with filter
filt = people_dataframe['first_name'] == 'mickey'
people_dataframe.loc[filt, 'first_name'] = 'mouse'
people_dataframe

Unnamed: 0,email,first_name,last_name,sex,age
0,breadginger@email.com,ginger,bread,female,18
1,baddiebaldie@email.com,baddie,baldie,female,27
2,mickeymann@email.com,mouse,mann,male,33
3,meggymuffin@email.com,meggy,muffin,female,15


In [506]:
# Capitalizing the first names with the .str accessor.
# The result is only displayed: nothing is assigned back, so the dataframe keeps its lowercase values.
people_dataframe['first_name'].str.capitalize()

0    Ginger
1    Baddie
2     Mouse
3     Meggy
Name: first_name, dtype: object

In [507]:
# Upper-casing every email by applying a function to each value of the column.
# The result is only displayed: nothing is assigned back, so the dataframe is unchanged.
people_dataframe['email'].apply(lambda x: x.upper())

0     BREADGINGER@EMAIL.COM
1    BADDIEBALDIE@EMAIL.COM
2      MICKEYMANN@EMAIL.COM
3     MEGGYMUFFIN@EMAIL.COM
Name: email, dtype: object

In [508]:
# Applying str.capitalize to every value of every selected column with applymap.
# The 'age' column is excluded by the boolean column mask because str.capitalize only accepts strings.
# The result is only displayed; the dataframe itself is not changed.
# NOTE(review): newer pandas releases deprecate DataFrame.applymap in favour of DataFrame.map — confirm against the installed version.
people_dataframe.loc[:, people_dataframe.columns != 'age'].applymap(str.capitalize)

Unnamed: 0,email,first_name,last_name,sex
0,Breadginger@email.com,Ginger,Bread,Female
1,Baddiebaldie@email.com,Baddie,Baldie,Female
2,Mickeymann@email.com,Mouse,Mann,Male
3,Meggymuffin@email.com,Meggy,Muffin,Female


In [509]:
# Replacing several values of one column at once using an {old value: new value} mapping;
# values that are not keys of the mapping are left as they are
people_dataframe['first_name'] = people_dataframe['first_name'].replace({'mouse' : 'mickey', 'meggy' : 'chungus'})
people_dataframe

Unnamed: 0,email,first_name,last_name,sex,age
0,breadginger@email.com,ginger,bread,female,18
1,baddiebaldie@email.com,baddie,baldie,female,27
2,mickeymann@email.com,mickey,mann,male,33
3,meggymuffin@email.com,chungus,muffin,female,15


### Adding and removing rows and columns from the data frame

In [510]:
# Combining two columns and creating another column from it
people_dataframe['full_name'] = people_dataframe['first_name'] + ' ' + people_dataframe['last_name']
people_dataframe

Unnamed: 0,email,first_name,last_name,sex,age,full_name
0,breadginger@email.com,ginger,bread,female,18,ginger bread
1,baddiebaldie@email.com,baddie,baldie,female,27,baddie baldie
2,mickeymann@email.com,mickey,mann,male,33,mickey mann
3,meggymuffin@email.com,chungus,muffin,female,15,chungus muffin


In [511]:
# Removing columns
people_dataframe.drop(columns=['first_name', 'last_name'], inplace=True)
people_dataframe

Unnamed: 0,email,sex,age,full_name
0,breadginger@email.com,female,18,ginger bread
1,baddiebaldie@email.com,female,27,baddie baldie
2,mickeymann@email.com,male,33,mickey mann
3,meggymuffin@email.com,female,15,chungus muffin


In [512]:
# Creating new columns by splitting values from other columns.
# n=1 limits the split to the first run of whitespace, so a full name with more
# than two words still produces exactly two parts (everything after the first
# word goes to last_name). Without the limit such a name would expand into three
# or more columns and the assignment to the two target columns would fail.
people_dataframe[['first_name', 'last_name']] = people_dataframe['full_name'].str.split(n=1, expand=True)
people_dataframe

Unnamed: 0,email,sex,age,full_name,first_name,last_name
0,breadginger@email.com,female,18,ginger bread,ginger,bread
1,baddiebaldie@email.com,female,27,baddie baldie,baddie,baldie
2,mickeymann@email.com,male,33,mickey mann,mickey,mann
3,meggymuffin@email.com,female,15,chungus muffin,chungus,muffin


In [513]:
# Adding new column and values
people_dataframe = people_dataframe.assign(has_hands=[False, True, True, False])
people_dataframe

Unnamed: 0,email,sex,age,full_name,first_name,last_name,has_hands
0,breadginger@email.com,female,18,ginger bread,ginger,bread,False
1,baddiebaldie@email.com,female,27,baddie baldie,baddie,baldie,True
2,mickeymann@email.com,male,33,mickey mann,mickey,mann,True
3,meggymuffin@email.com,female,15,chungus muffin,chungus,muffin,False


In [514]:
# Adding dataframe to another dataframe
people2 = {
    'email' : ['bigpotato@email.com'],
    'sex' : ['male'],
    'age' : [25],
    'full_name' : ['big potato'],
    'first_name' : ['big'],
    'last_name' : ['potato'],
    'has_hands' : [False],
}
people_dataframe_2 = pd.DataFrame(people2)

# pd.concat is used instead of DataFrame.append: append is deprecated since
# pandas 1.4 and removed in pandas 2.0, while concat works on every version.
# ignore_index=True renumbers the rows so the added row does not reuse label 0.
people_dataframe = pd.concat([people_dataframe, people_dataframe_2], ignore_index=True)
people_dataframe

Unnamed: 0,email,sex,age,full_name,first_name,last_name,has_hands
0,breadginger@email.com,female,18,ginger bread,ginger,bread,False
1,baddiebaldie@email.com,female,27,baddie baldie,baddie,baldie,True
2,mickeymann@email.com,male,33,mickey mann,mickey,mann,True
3,meggymuffin@email.com,female,15,chungus muffin,chungus,muffin,False
4,bigpotato@email.com,male,25,big potato,big,potato,False


In [515]:
# Adding new rows
people_dataframe.loc[len(people_dataframe.index)] = ['sammiacm@email.com', 'female', 45, 'sammi acm', 'sammi', 'acm', True]
people_dataframe

Unnamed: 0,email,sex,age,full_name,first_name,last_name,has_hands
0,breadginger@email.com,female,18,ginger bread,ginger,bread,False
1,baddiebaldie@email.com,female,27,baddie baldie,baddie,baldie,True
2,mickeymann@email.com,male,33,mickey mann,mickey,mann,True
3,meggymuffin@email.com,female,15,chungus muffin,chungus,muffin,False
4,bigpotato@email.com,male,25,big potato,big,potato,False
5,sammiacm@email.com,female,45,sammi acm,sammi,acm,True


In [516]:
# Removing every row from label 4 up to the last one (the result is displayed, not stored).
# len() is used rather than calling __len__() directly, and the range is turned
# into the label list as-is, since a comprehension that only copies each element adds nothing.
people_dataframe.drop(index=list(range(4, len(people_dataframe.index))))

Unnamed: 0,email,sex,age,full_name,first_name,last_name,has_hands
0,breadginger@email.com,female,18,ginger bread,ginger,bread,False
1,baddiebaldie@email.com,female,27,baddie baldie,baddie,baldie,True
2,mickeymann@email.com,male,33,mickey mann,mickey,mann,True
3,meggymuffin@email.com,female,15,chungus muffin,chungus,muffin,False


In [517]:
# Removing rows by conditions
filt = (people_dataframe['first_name'] == 'big') | (people_dataframe['last_name'] == 'acm')
people_dataframe.drop(index=people_dataframe[filt].index)

Unnamed: 0,email,sex,age,full_name,first_name,last_name,has_hands
0,breadginger@email.com,female,18,ginger bread,ginger,bread,False
1,baddiebaldie@email.com,female,27,baddie baldie,baddie,baldie,True
2,mickeymann@email.com,male,33,mickey mann,mickey,mann,True
3,meggymuffin@email.com,female,15,chungus muffin,chungus,muffin,False


### Sorting data

In [518]:
# Sorting by single column
people_dataframe.sort_values(by=['full_name'], ascending=True)

Unnamed: 0,email,sex,age,full_name,first_name,last_name,has_hands
1,baddiebaldie@email.com,female,27,baddie baldie,baddie,baldie,True
4,bigpotato@email.com,male,25,big potato,big,potato,False
3,meggymuffin@email.com,female,15,chungus muffin,chungus,muffin,False
0,breadginger@email.com,female,18,ginger bread,ginger,bread,False
2,mickeymann@email.com,male,33,mickey mann,mickey,mann,True
5,sammiacm@email.com,female,45,sammi acm,sammi,acm,True


In [519]:
# Sorting by multiple columns
people_dataframe.sort_values(by=['first_name', 'age'], ascending=[True, False])

Unnamed: 0,email,sex,age,full_name,first_name,last_name,has_hands
1,baddiebaldie@email.com,female,27,baddie baldie,baddie,baldie,True
4,bigpotato@email.com,male,25,big potato,big,potato,False
3,meggymuffin@email.com,female,15,chungus muffin,chungus,muffin,False
0,breadginger@email.com,female,18,ginger bread,ginger,bread,False
2,mickeymann@email.com,male,33,mickey mann,mickey,mann,True
5,sammiacm@email.com,female,45,sammi acm,sammi,acm,True


In [520]:
# Single column sorting
people_dataframe['email'].sort_values(ascending=True)

1    baddiebaldie@email.com
4       bigpotato@email.com
0     breadginger@email.com
3     meggymuffin@email.com
2      mickeymann@email.com
5        sammiacm@email.com
Name: email, dtype: object

### Grouping and aggregating data

In [521]:
# Grouping data by a column and condition
sexuality_group = people_dataframe.groupby(by='sex')
sexuality_group.get_group(name='female')

Unnamed: 0,email,sex,age,full_name,first_name,last_name,has_hands
0,breadginger@email.com,female,18,ginger bread,ginger,bread,False
1,baddiebaldie@email.com,female,27,baddie baldie,baddie,baldie,True
3,meggymuffin@email.com,female,15,chungus muffin,chungus,muffin,False
5,sammiacm@email.com,female,45,sammi acm,sammi,acm,True


In [522]:
# Aggregating data by sex and tallying the age by count
sexuality_group['age'].value_counts()

sex     age
female  15     1
        18     1
        27     1
        45     1
male    25     1
        33     1
Name: age, dtype: int64

In [523]:
# Aggregating data and tallying the age by count with a given condition
sexuality_group['age'].value_counts(normalize=True).loc['male']

age
25    0.5
33    0.5
Name: age, dtype: float64

In [524]:
# Applying a single aggregate function to a grouped data
sexuality_group['age'].median()

sex
female    22.5
male      29.0
Name: age, dtype: float64

In [525]:
# Applying multiple aggregate function to a grouped data
sexuality_group['age'].agg(['median', 'mean'])

Unnamed: 0_level_0,median,mean
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,22.5,26.25
male,29.0,29.0


In [526]:
# Grouping the rows by the has_hands flag
has_hands_group = people_dataframe.groupby(by='has_hands')
# Number of respondents of each sex over the whole dataframe
respondents_sexuality = people_dataframe['sex'].value_counts()
# Sex counts inside each has_hands group, keeping only the has_hands == True group
respondents_has_hands = has_hands_group['sex'].value_counts().get(True)
respondents_has_hands

sex
female    2
male      1
Name: sex, dtype: int64

In [527]:
# Placing the two count series side by side as columns; both are indexed by sex,
# and keys supplies the resulting column names ('sex' = total count, 'has_hands' = count with hands)
age_sex_dataframe = pd.concat([respondents_sexuality, respondents_has_hands], axis='columns', keys=['sex','has_hands'])
age_sex_dataframe

Unnamed: 0,sex,has_hands
female,4,2
male,2,1


In [528]:
# Adding the percentage of each sex that has hands, computed from the two count columns
# (the result is displayed, not stored)
age_sex_dataframe.assign(
    has_hands_percentage=lambda counts: (counts['has_hands'] / counts['sex']) * 100
)

Unnamed: 0,sex,has_hands,has_hands_percentage
female,4,2,50.0
male,2,1,50.0


### Cleaning data

In [529]:
# Simulating uncleaned data in three ways:
#   row 0 - genuine missing markers (np.nan and None)
#   row 4 - string placeholders that only look like missing values
#   new last row - every column empty
people_dataframe.loc[0, ['email', 'last_name']] = [np.nan, None]
people_dataframe.loc[4, ['email', 'last_name']] = ['Missing', 'NA']
people_dataframe.loc[len(people_dataframe.index)] = [None] * len(people_dataframe.columns)

In [530]:
# Dropping rows in the dataframe with NA values
people_dataframe.dropna() # this works just like people_dataframe.dropna(axis='index', how='any')

Unnamed: 0,email,sex,age,full_name,first_name,last_name,has_hands
1,baddiebaldie@email.com,female,27,baddie baldie,baddie,baldie,True
2,mickeymann@email.com,male,33,mickey mann,mickey,mann,True
3,meggymuffin@email.com,female,15,chungus muffin,chungus,muffin,False
4,Missing,male,25,big potato,big,,False
5,sammiacm@email.com,female,45,sammi acm,sammi,acm,True


In [531]:
# Dropping only rows with all values as none
people_dataframe.dropna(axis='index', how='all')

Unnamed: 0,email,sex,age,full_name,first_name,last_name,has_hands
0,,female,18,ginger bread,ginger,,False
1,baddiebaldie@email.com,female,27,baddie baldie,baddie,baldie,True
2,mickeymann@email.com,male,33,mickey mann,mickey,mann,True
3,meggymuffin@email.com,female,15,chungus muffin,chungus,muffin,False
4,Missing,male,25,big potato,big,,False
5,sammiacm@email.com,female,45,sammi acm,sammi,acm,True


In [532]:
# Dropping rows based on the 'first_name' column only: as long as the first name is filled in,
# the row is kept, no matter what is missing in the other columns
people_dataframe.dropna(axis='index', how='any', subset=['first_name'])

Unnamed: 0,email,sex,age,full_name,first_name,last_name,has_hands
0,,female,18,ginger bread,ginger,,False
1,baddiebaldie@email.com,female,27,baddie baldie,baddie,baldie,True
2,mickeymann@email.com,male,33,mickey mann,mickey,mann,True
3,meggymuffin@email.com,female,15,chungus muffin,chungus,muffin,False
4,Missing,male,25,big potato,big,,False
5,sammiacm@email.com,female,45,sammi acm,sammi,acm,True


In [533]:
# Dropping rows with the condition that for a row to be dropped, both values in the subset arguments should be missing
people_dataframe.dropna(axis='index', how='all', subset=['email', 'last_name'])

Unnamed: 0,email,sex,age,full_name,first_name,last_name,has_hands
1,baddiebaldie@email.com,female,27,baddie baldie,baddie,baldie,True
2,mickeymann@email.com,male,33,mickey mann,mickey,mann,True
3,meggymuffin@email.com,female,15,chungus muffin,chungus,muffin,False
4,Missing,male,25,big potato,big,,False
5,sammiacm@email.com,female,45,sammi acm,sammi,acm,True


In [550]:
# dropna does not treat the string placeholders as missing, so they are first
# turned into np.nan across the whole dataframe; afterwards every row that still
# has a missing value is dropped
people_dataframe.replace(['NA', 'Missing'], np.nan, inplace=True)
people_dataframe.dropna(inplace=True)

In [538]:
# Map which value is na
people_dataframe.isna()

Unnamed: 0,email,sex,age,full_name,first_name,last_name,has_hands
0,True,False,False,False,False,True,False
1,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False
4,True,False,False,False,False,True,False
5,False,False,False,False,False,False,False
6,True,True,True,True,True,True,True


In [549]:
# Replacing all NA values with a chosen fill value.
# NOTE(review): the fill value used here is np.nan itself, so missing entries stay missing —
# presumably a placeholder; pass a real value (e.g. 0 or 'unknown') to see an effect.
people_dataframe.fillna(np.nan, inplace=True)

In [554]:
# Converting a column to other data type
people_dataframe['has_hands'] = people_dataframe['has_hands'].astype(bool)
people_dataframe

Unnamed: 0,email,sex,age,full_name,first_name,last_name,has_hands
1,baddiebaldie@email.com,female,27.0,baddie baldie,baddie,baldie,True
2,mickeymann@email.com,male,33.0,mickey mann,mickey,mann,True
3,meggymuffin@email.com,female,15.0,chungus muffin,chungus,muffin,False
5,sammiacm@email.com,female,45.0,sammi acm,sammi,acm,True


### Working with date time objects

In [590]:
# Reading the dataframe and parsing dates
from datetime import datetime  # not used by this cell any more; kept in case other cells rely on the name

# Forward slashes keep the path portable. The previous 'data\ETH_1h.csv' only
# resolved on Windows and contained '\E', which is not a valid string escape
# (Python warns about it).
eth_dataframe = pd.read_csv('data/ETH_1h.csv', index_col=0)

# The dates are converted after loading instead of through read_csv's
# date_parser argument, which newer pandas versions deprecate. pd.to_datetime
# with an explicit format also parses the whole column in one call rather than
# running strptime once per value.
# Format: year-month-day, then the 12-hour clock hour, a dash and AM/PM.
eth_dataframe['Date'] = pd.to_datetime(eth_dataframe['Date'], format=r'%Y-%m-%d %I-%p')

eth_dataframe.columns = eth_dataframe.columns.str.lower()
eth_dataframe

Unnamed: 0,date,symbol,open,high,low,close,volume
0,2020-03-13 20:00:00,ETHUSD,129.94,131.82,126.87,128.71,1940673.93
1,2020-03-13 19:00:00,ETHUSD,119.51,132.02,117.10,129.94,7579741.09
2,2020-03-13 18:00:00,ETHUSD,124.47,124.85,115.50,119.51,4898735.81
3,2020-03-13 17:00:00,ETHUSD,124.08,127.42,121.63,124.47,2753450.92
4,2020-03-13 16:00:00,ETHUSD,124.85,129.51,120.17,124.08,4461424.71
...,...,...,...,...,...,...,...
23669,2017-07-01 15:00:00,ETHUSD,265.74,272.74,265.00,272.57,1500282.55
23670,2017-07-01 14:00:00,ETHUSD,268.79,269.90,265.00,265.74,1702536.85
23671,2017-07-01 13:00:00,ETHUSD,274.83,274.93,265.00,268.79,3010787.99
23672,2017-07-01 12:00:00,ETHUSD,275.01,275.01,271.00,274.83,824362.87


In [580]:
# Analyzing data types of the dataframe 
eth_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23674 entries, 0 to 23673
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   date    23674 non-null  object 
 1   symbol  23674 non-null  object 
 2   open    23674 non-null  float64
 3   high    23674 non-null  float64
 4   low     23674 non-null  float64
 5   close   23674 non-null  float64
 6   volume  23674 non-null  float64
dtypes: float64(5), object(2)
memory usage: 1.4+ MB


In [592]:
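# Alternative: converting the date column to date time objects after loading
# (left commented out because the dates are already parsed when the csv file is read)
# eth_dataframe['date'] = pd.to_datetime(eth_dataframe['date'], format=r'%Y-%m-%d %I-%p')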

In [605]:
# Appending new day of the week column to the dataframe with date time method
eth_dataframe = eth_dataframe.assign(day_of_week = eth_dataframe['date'].dt.day_name())

In [609]:
# Moving the day of week column next to date
eth_dataframe = eth_dataframe[['date', 'day_of_week', 'symbol', 'open', 'high', 'low', 'close', 'volume']]

In [610]:
eth_dataframe

Unnamed: 0,date,day_of_week,symbol,open,high,low,close,volume
0,2020-03-13 20:00:00,Friday,ETHUSD,129.94,131.82,126.87,128.71,1940673.93
1,2020-03-13 19:00:00,Friday,ETHUSD,119.51,132.02,117.10,129.94,7579741.09
2,2020-03-13 18:00:00,Friday,ETHUSD,124.47,124.85,115.50,119.51,4898735.81
3,2020-03-13 17:00:00,Friday,ETHUSD,124.08,127.42,121.63,124.47,2753450.92
4,2020-03-13 16:00:00,Friday,ETHUSD,124.85,129.51,120.17,124.08,4461424.71
...,...,...,...,...,...,...,...,...
23669,2017-07-01 15:00:00,Saturday,ETHUSD,265.74,272.74,265.00,272.57,1500282.55
23670,2017-07-01 14:00:00,Saturday,ETHUSD,268.79,269.90,265.00,265.74,1702536.85
23671,2017-07-01 13:00:00,Saturday,ETHUSD,274.83,274.93,265.00,268.79,3010787.99
23672,2017-07-01 12:00:00,Saturday,ETHUSD,275.01,275.01,271.00,274.83,824362.87
