
Welcome to Pandas for Data Science

Today's agenda:

    Reading Data in Pandas
    Sorting/Describing Data
    For Loops with Pandas Dataframes
    Filtering Data
    Making Changes to the Data
    


DATA ACCESS WITH PANDAS

In [None]:
import pandas as pd

print('pandas version', pd.__version__)

# GDP per capita for countries in Europe.
data = pd.read_csv('https://raw.githubusercontent.com/laitanawe/pandasds/main/workshops/data/gapminder_gdp_europe.csv')

# Second example dataset (a housing sample, judging by the file name).
store = pd.read_csv('https://raw.githubusercontent.com/laitanawe/pandasds/main/workshops/data/cal_housing_small.csv')

# Colab shortcuts:
#   Ctrl + Shift + Enter = run only the selected line(s)
#   Ctrl + Enter         = run every line in the cell
#
# A notebook renders only the LAST bare expression of a cell, so every
# intermediate result below is wrapped in display() to keep it visible when
# the whole cell is run at once.

display(store.head(10))

display(data.head(10))

# print() gives a plain-text dump of the DataFrame ...
print(data)

# ... while display() renders the richer table view.
display(data)

# (SORTING DATA)
# sort_values returns a new, sorted DataFrame; `data` itself is not modified.
# ascending defaults to True.
display(data.sort_values("country"))

# With several keys, the first key takes precedence; later keys only break
# ties among rows that share the same value for the earlier keys.
display(data.sort_values(["country", "gdpPercap_1952"]))

# A single Boolean applies to every key: both country and gdpPercap_1952
# are sorted in descending order here.
display(data.sort_values(["country", "gdpPercap_1952"], ascending=False))

# A list of Booleans sets the direction per key:
# country is ascending, gdpPercap_1952 is descending.
display(data.sort_values(["country", "gdpPercap_1952"], ascending=[True, False]))

# Sorting on gdpPercap_1952 only, explicitly ascending.
display(data.sort_values(["gdpPercap_1952"], ascending=[True]))

# Same result: ascending is True by default.
display(data.sort_values(["gdpPercap_1952"]))

# To look at random rows, e.g. 5 samples (a different draw on every run
# unless random_state is passed):
data.sample(5)


FOR LOOPS WITH DATAFRAMES

In [None]:
import pandas as pd

print('pandas version', pd.__version__)

# GDP per capita for countries in Europe.
data = pd.read_csv(
    'https://raw.githubusercontent.com/laitanawe/pandasds/main/workshops/data/gapminder_gdp_europe.csv'
)

# Second example dataset, loaded alongside the GDP table.
store = pd.read_csv(
    'https://raw.githubusercontent.com/laitanawe/pandasds/main/workshops/data/cal_housing_small.csv'
)

# FOR LOOPS WITH DATAFRAMES
# iterrows() yields (index label, row-as-Series) pairs, one pair per row.

# 1) Dump every column of each row.
for label, record in data.iterrows():
    print(label)
    print(record)
    print("\n\n")

# 2) Show just one column of each row.
for label, record in data.iterrows():
    print(label)
    print(record['gdpPercap_1962'])
    print("\n\n")

# 3) Same single column, this time prefixed with its name for readability.
for label, record in data.iterrows():
    print(label)
    print("gdpPercap_1962: ", record['gdpPercap_1962'])
    print("\n\n")

# 4) Pick several columns of each row by passing a list of column names.
for label, record in data.iterrows():
    print(label)
    print(record[['gdpPercap_1952', 'gdpPercap_1962']])
    print("\n\n")


pandas version 2.1.4
0
country               Albania
gdpPercap_1952    1601.056136
gdpPercap_1957    1942.284244
gdpPercap_1962    2312.888958
gdpPercap_1967    2760.196931
gdpPercap_1972    3313.422188
gdpPercap_1977     3533.00391
gdpPercap_1982    3630.880722
gdpPercap_1987    3738.932735
gdpPercap_1992    2497.437901
gdpPercap_1997    3193.054604
gdpPercap_2002    4604.211737
gdpPercap_2007    5937.029526
Name: 0, dtype: object



1
country               Austria
gdpPercap_1952    6137.076492
gdpPercap_1957     8842.59803
gdpPercap_1962    10750.72111
gdpPercap_1967     12834.6024
gdpPercap_1972     16661.6256
gdpPercap_1977     19749.4223
gdpPercap_1982    21597.08362
gdpPercap_1987    23687.82607
gdpPercap_1992    27042.01868
gdpPercap_1997    29095.92066
gdpPercap_2002    32417.60769
gdpPercap_2007     36126.4927
Name: 1, dtype: object



2
country               Belgium
gdpPercap_1952    8343.105127
gdpPercap_1957    9714.960623
gdpPercap_1962    10991.20676
gdpPercap_1967    131

FILTERING DATA

In [None]:
import pandas as pd

print('pandas version', pd.__version__)

# GDP per capita for countries in Europe.
data = pd.read_csv('https://raw.githubusercontent.com/laitanawe/pandasds/main/workshops/data/gapminder_gdp_europe.csv')

# Second example dataset (a housing sample, judging by the file name).
store = pd.read_csv('https://raw.githubusercontent.com/laitanawe/pandasds/main/workshops/data/cal_housing_small.csv')

# FILTERING DATA
# A notebook renders only the LAST bare expression of a cell, so each
# intermediate result is wrapped in display() to keep it visible when the
# whole cell is run at once.

# To check the properties of a dataframe (column names, dtypes, non-null
# counts), you can do:
data.info()

# Boolean mask: True for rows where gdpPercap_1962 is less than 2,600.
low_gdp_1962 = data['gdpPercap_1962'] < 2600

# All columns of the matching rows.
display(data.loc[low_gdp_1962])

# Only specific columns of the matching rows: pass the row mask and the
# column list to ONE .loc call.
display(data.loc[low_gdp_1962, ['gdpPercap_1962', 'gdpPercap_1982', 'gdpPercap_1997']])

# Avoid chained indexing such as data.loc[mask][[columns]]; the single
# .loc[mask, columns] form above is the explicit way to select both at once.

# Combine conditions with & (and) or | (or). Each condition needs its own
# parentheses because & and | bind tighter than the comparison operators.
# Using the & operator: both conditions must hold.
display(data.loc[low_gdp_1962 & (data['gdpPercap_1997'] > 100)])

# Using the | operator: at least one condition must hold.
display(data.loc[low_gdp_1962 | (data['gdpPercap_1997'] > 100)])

# Filter the countries whose name contains the text 'land'.
display(data[data["country"].str.contains("land")])

# The match is case-sensitive by default. Is there a match when you run this?
display(data[data["country"].str.contains("Land")])

# Case-insensitive match. Is there a match when you run this?
display(data[data["country"].str.contains("Land", case=False)])

# Is there a match when you run this?
display(data[data["country"].str.contains("France", case=False)])

# .isin keeps the rows whose country equals any entry of the given list.
# Which of Poland, Germany and Austria are found in the data?
display(data[data["country"].isin(["Poland", "Germany", "Austria"])])

display(data[data["country"].isin(["Poland", "Germany"])])

# You can also filter data with the .query method, which takes the condition
# as a string.
data.query('country == "France"')

Unnamed: 0,country,gdpPercap_1952,gdpPercap_1957,gdpPercap_1962,gdpPercap_1967,gdpPercap_1972,gdpPercap_1977,gdpPercap_1982,gdpPercap_1987,gdpPercap_1992,gdpPercap_1997,gdpPercap_2002,gdpPercap_2007
9,France,7029.809327,8662.834898,10560.48553,12999.91766,16107.19171,18292.63514,20293.89746,22066.44214,24703.79615,25889.78487,28926.03234,30470.0167


ADDING / REMOVING DATA

In [6]:
import numpy as np
import pandas as pd

print('pandas version', pd.__version__)

# GDP per capita for countries in Europe.
data = pd.read_csv('https://raw.githubusercontent.com/laitanawe/pandasds/main/workshops/data/gapminder_gdp_europe.csv')

# Second example dataset (a housing sample, judging by the file name).
store = pd.read_csv('https://raw.githubusercontent.com/laitanawe/pandasds/main/workshops/data/cal_housing_small.csv')

# A notebook renders only the LAST bare expression of a cell, so each
# intermediate result is wrapped in display() to keep it visible when the
# whole cell is run at once.
display(data.head(10))

# Add a new column: a scalar value is assigned to every row.
data["population"] = 20000
display(data.head())

# numpy's where() helper fills the column conditionally:
# np.where(condition, value if True, value if False)
data["population"] = np.where(data['country'] == 'Austria', 20000, 35000)
display(data.head())

# Work on an independent copy so that changes to data2 never touch data.
data2 = data.copy()
data2["date"] = 2024
display(data2.head())

# .drop returns a NEW DataFrame without the two columns. The result is only
# displayed here, not assigned, so data2 itself still has both columns.
display(data2.drop(columns=['gdpPercap_1972', 'gdpPercap_1977']))

# With inplace=True, .drop modifies data2 itself (and returns None).
data2.drop(columns=['gdpPercap_1972', 'gdpPercap_1977'], inplace=True)
display(data2)   # the two columns are gone
display(data)    # the original frame is unaffected

# If you only need a few specific columns, select them with a list of names.
# .copy() makes data2 an independent frame, so adding a column below does not
# raise pandas' SettingWithCopyWarning.
data2 = data2[['gdpPercap_1952', 'gdpPercap_1957','gdpPercap_1962', 'gdpPercap_1967','gdpPercap_1982', 'gdpPercap_1987']].copy()

# Create a new column using a formula.
data2['normgdp52'] = data2['gdpPercap_1952'] * .001
display(data2['normgdp52'])

display(data2)

# Rename a column of a data frame.
# Without inplace, rename returns a renamed copy; data2 is not modified.
display(data2.rename(columns={'gdpPercap_1952':'newgdp52'}))
# With inplace=True, data2 itself is modified.
data2.rename(columns={'gdpPercap_1952':'newgdp52'}, inplace=True)

# Alternative to inplace=True: assign the returned frame back to the name.
# At this point the column has already been renamed above, so this call
# changes nothing (rename ignores labels that are not present by default).
data2 = data2.rename(columns={'gdpPercap_1952':'newgdp52'})
display(data2)

# Tokyo Olympics
tokyo_data = pd.read_csv('https://raw.githubusercontent.com/laitanawe/pandasds/main/workshops/data/tokyo_olympics.csv')
display(tokyo_data.head())

# Take the text before the first space of person_name as the first name.
tokyo_data['firstname'] = tokyo_data['person_name'].str.split(' ').str[0]
display(tokyo_data)

# Same extraction, then convert the string to title case.
tokyo_data['firstname'] = tokyo_data['person_name'].str.split(' ').str[0].str.title()
display(tokyo_data)

display(tokyo_data.head())
tokyo_data.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   person_name  20 non-null     object 
 1   country      20 non-null     object 
 2   discipline   20 non-null     object 
 3   born_date    20 non-null     object 
 4   height_cm    20 non-null     float64
 5   weight_kg    20 non-null     float64
 6   firstname    20 non-null     object 
dtypes: float64(2), object(5)
memory usage: 1.2+ KB
