Pandas Series

In [None]:
import pandas as pd
import numpy as np

# Series built from a plain list: pandas supplies a default 0..n-1 integer index.
data = list(range(1, 6))
series_from_list = pd.Series(data)
print(series_from_list)

# Series built from a dict: the keys become the index labels.
data = dict(a=1, b=2, c=3)
series_from_dict = pd.Series(data)
print(series_from_dict)

# Series built from a scalar: the value is repeated for every label in `index`.
series_from_scalar = pd.Series(5, index=list('abc'))
print(series_from_scalar)


In [None]:
# Inspect the index labels of the scalar-built Series.
print(series_from_scalar.index)

In [None]:
# Replace the index labels by assigning to .index; this modifies series_from_scalar in place.
series_from_scalar.index = ['X', 'Y', 'Z']

In [None]:
# Show the Series together with its current index labels.
print(series_from_scalar)

In [None]:
# .index holds the labels; .values holds the underlying data as a NumPy array.
print(series_from_scalar.index, series_from_scalar.values)

In [None]:
# Five random integers in [0, 10) labelled 'a'..'e'. A seeded Generator is used so the
# values (and every output derived from s7) are identical on each Restart & Run All.
s7 = pd.Series(np.random.default_rng(42).integers(0, 10, size=5), index=['a', 'b', 'c', 'd', 'e'])
print(s7)

In [None]:
# Single element: .loc looks up by label, .iloc by integer position.
print(s7.loc['a'], s7.iloc[0])
# Label slice with .loc: the end label 'd' is included.
print(s7.loc['b':'d'])
# Positional slice with .iloc: the end position 4 is excluded, so this also selects 'b'..'d'.
print(s7.iloc[1:4])

In [None]:
# Selecting several elements at once by passing a list instead of a single key.
indices = ['b', 'c', 'd']
print(s7.loc[indices])             # list of labels held in a variable
print(s7.loc[['a', 'c', 'e']])     # list of labels written inline
print(s7.iloc[[0, 2, 4]])          # list of integer positions (the same three elements as the line above)

Basic Statistical Operations


In [None]:
# Show the Series that the statistics below are computed on.
print(series_from_list)

In [None]:
# Summary statistics of the values in series_from_list.
print(series_from_list.mean())  # arithmetic mean
print(series_from_list.sum())   # sum of all values
print(series_from_list.max())   # largest value
print(series_from_list.min())   # smallest value
print(series_from_list.std())   # standard deviation (pandas defaults to the sample form, ddof=1)


Handling Missing Data

In [None]:
# A None inside numeric data is stored as NaN, so the resulting Series is float-typed.
data = [1, 2, None, 4, 5]
series_with_nan = pd.Series(data)
print(series_with_nan)
# Element-wise mask: True where the value is missing
print(series_with_nan.isna())
# Does the Series contain at least one missing value?
print(series_with_nan.isna().any())
# Filling missing values (returns a new Series; series_with_nan is unchanged)
print(series_with_nan.fillna(0))
# Dropping missing values (also returns a new Series)
print(series_with_nan.dropna())


Pandas DataFrame


In [None]:
# Column-oriented input: each key is a column name, each list holds that column's values.
data1 = dict(
    name=['Alice', 'Bob', 'Charlie'],
    age=[25, 30, 35],
    city=['New York', 'Los Angeles', 'Chicago'],
)
df1 = pd.DataFrame(data1)
print(df1)

# Row-oriented input: one dict per row, keyed by column name.
data2 = [
    dict(name='Alice', age=25, city='New York'),
    dict(name='Bob', age=30, city='Los Angeles'),
    dict(name='Charlie', age=35, city='Chicago'),
]
df2 = pd.DataFrame(data2)
print(df2)


In [None]:
# Build a DataFrame from labelled Series. The 'age' Series has an extra label 'd',
# so the resulting frame has rows a..d and the other columns are missing at 'd'.
data3 = {
    'name': pd.Series(['Alice', 'Bob', 'Charlie'], index=list('abc')),
    'age': pd.Series([25, 30, 35, 40], index=list('abcd')),
    'city': pd.Series(['New York', 'Los Angeles', 'Chicago'], index=list('abc')),
}
df3 = pd.DataFrame(data3)
print(df3)

# Build a DataFrame from a list of rows; the column names are supplied separately.
columns = ['name', 'age', 'city']
data4 = [
    ['Alice', 25, 'New York'],
    ['Bob', 30, 'Los Angeles'],
    ['Charlie', 35, 'Chicago'],
]
df4 = pd.DataFrame(data=data4, columns=columns)
print(df4)


In [None]:
# From a dictionary
data1 = {
    'name': ['Alice', 'Bob', 'Charlie'],
    'age': [25, 30, 35],
    'city': ['New York', 'Los Angeles', 'Chicago']
}
df = pd.DataFrame(data1)
print(df)

# Accessing a single column by name (returns a Series)
print(df['name'])

# Accessing multiple columns with a list of names (returns a DataFrame)
print(df[['name', 'age']])

# Accessing a row by its index label with .loc
print(df.loc[0])

# Accessing multiple rows by label; .loc slices include the end label, so this is rows 0 and 1
print(df.loc[0:1])

# Accessing a single element by row label and column name
print(df.at[0, 'name'])

# Accessing a single element by integer row and column position
print(df.iat[0, 0])


In [None]:
# Rebuild df from the original data so this cell can be re-run safely. Without this,
# a second run increments 'age' again and df.insert() raises ValueError because the
# 'sex' column already exists.
df = pd.DataFrame(data1)

# Adding a new column (one value per existing row)
df['salary'] = [50000, 60000, 70000]
print(df)

# Modifying an existing column
df['age'] = df['age'] + 1
print(df)

# Insert a column at position 2 (between 'age' and 'city'); the Series is aligned on the row index
df.insert(2, 'sex', pd.Series(['F', 'M', 'M']))
print(df)


In [None]:
# Dropping a column. errors='ignore' keeps the cell re-runnable: on a second run the
# column is already gone and drop() would otherwise raise KeyError.
df = df.drop(columns=['salary'], errors='ignore')
print(df)

# Dropping a row by its index label (0); errors='ignore' for the same re-run reason.
df = df.drop(index=0, errors='ignore')
print(df)


In [None]:
# Boolean mask: True for every row whose age is above 30.
is_over_30 = df['age'] > 30
filtered_df = df[is_over_30]
print(filtered_df)

# Combine two masks with & (element-wise AND) to require both conditions.
lives_in_chicago = df['city'] == 'Chicago'
selected_df = df[is_over_30 & lives_in_chicago]
print(selected_df)


In [None]:
# Sample data. Note that most 'Name' strings carry leading/trailing spaces.
dataFrame = pd.DataFrame(
   {'Name': [' RACHEL  ', ' MONICA  ', ' PHOEBE  ',
            '  ROSS    ', 'CHANDLER', ' JOEY    '],
   'Age': [30, 35, 37, 33, 34, 30],
   'Salary': [100000, 93000, 88000, 120000, 94000, 95000],
   'JOB': ['DESIGNER', 'CHEF', 'MASUS', 
           'PALENTOLOGY','IT', 'ARTIST']})

# Filter with a query expression: Salary at most 100000, Age under 40, and JOB starting with "C".
# NOTE(review): the trailing .values looks like a workaround that turns the string-method
# result into a plain array inside query() — confirm whether it is still required.
display(dataFrame.query('Salary  <= 100000 & Age < 40 & JOB.str.startswith("C").values'))


In [None]:
# Creating a DataFrame with missing values (None becomes NaN/None in the frame)
data = {
    'name': ['Alice', 'Bob', 'Charlie'],
    'age': [25, None, 35],
    'city': ['New York', 'Los Angeles', None]
}
df = pd.DataFrame(data)
print(df)

# Checking for missing values (True marks a missing cell)
print(df.isna())

# Filling missing values with a per-column replacement. A single string for the whole
# frame would put text into the numeric 'age' column and turn it into a mixed-type column,
# so 'age' is filled with its mean and only 'city' gets the 'Unknown' placeholder.
df_filled = df.fillna({'age': df['age'].mean(), 'city': 'Unknown'})
print(df_filled)

# Dropping rows with missing values (any row containing at least one missing cell)
df_dropped = df.dropna()
print(df_dropped)


Creating a DataFrame from a file

In [None]:
# Source of the survey data:
# https://raw.githubusercontent.com/The-CEAS-Library/Data_Manipulation_with_Python/master/data/surveys.csv
# NOTE(review): the file is read from a local ./data directory — presumably downloaded
# from the URL above beforehand; confirm.

surveys_df = pd.read_csv("./data/surveys.csv")
surveys_df


In [None]:
# head() shows the first 5 rows by default; tail(10) shows the last 10 rows.
print(surveys_df.head())
print(surveys_df.tail(10))


Writing a DataFrame to a file


In [None]:
# Reuse the frame already loaded from ./data/surveys.csv instead of reading a second,
# differently located "surveys.csv" (which fails unless a copy exists in the working directory).
surveys_df_head = surveys_df.head(30)
# Write the first 30 rows to a new CSV file, without the index column.
surveys_df_head.to_csv('surveys_head.csv', index=False)


In [None]:
# Create a sample DataFrame
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35]
}
df = pd.DataFrame(data)

# Save the DataFrame to an Excel file, on a sheet named 'People', without the index.
# NOTE(review): writing .xlsx needs an Excel engine package (e.g. openpyxl) installed — confirm the environment.
df.to_excel('output.xlsx', sheet_name='People', index=False)


Accessing Data Attributes


In [None]:
# Data type of every column.
surveys_df.dtypes


In [None]:
# All column labels.
surveys_df.columns


In [None]:
# The first four column labels (positions 0..3).
surveys_df.columns[0:4]


In [None]:
# (number of rows, number of columns)
surveys_df.shape


In [None]:
# Select one column with bracket notation (returns a Series).
surveys_df['record_id']


In [None]:
# Select several columns with a list of names (returns a DataFrame).
surveys_df[['month', 'day']]


In [None]:
# Attribute-style access to the same column as surveys_df['record_id'].
surveys_df.record_id


Selecting Unique values from a column


In [None]:
# Three equivalent ways to list the distinct values of the 'species_id' column.
print(pd.unique(surveys_df['species_id']))   # top-level function, bracket access
print(pd.unique(surveys_df.species_id))      # top-level function, attribute access
print(surveys_df['species_id'].unique())     # Series method

In [None]:
# Slicing a DataFrame with [start:stop] selects rows by position (stop is excluded).
print(surveys_df[0:5])    # first five rows
print(surveys_df[3:7])    # rows at positions 3..6
print(surveys_df[5:])     # everything from position 5 onward
print(surveys_df[:5])     # first five rows again
print(surveys_df[-1:])    # last row
print(surveys_df[-3:])    # last three rows
print(surveys_df[-3:-1])  # the two rows before the last one

In [None]:
# Whole 'record_id' column.
print(surveys_df['record_id'])
# [0] on a Series looks up the index label 0; surveys_df was read without an index
# column, so with its default integer index this is the first row.
print(surveys_df['record_id'][0])
print(surveys_df.record_id[0])      # same lookup using attribute access
# Slice of the column (first four values).
print(surveys_df['record_id'][0:4])
# Two columns, then the first five rows of that sub-frame.
print(surveys_df[['record_id','plot_id']][0:5])


In [None]:
# .iloc selects purely by integer position: [rows, columns]; slice ends are excluded.
print(surveys_df.iloc[2, 6])                   # single cell: row 2, column 6
print(surveys_df.iloc[1:4, 0:5])               # rows 1..3, columns 0..4
print(surveys_df.iloc[3:5, 5:8])               # rows 3..4, columns 5..7
print(surveys_df.iloc[3:5, :])                 # rows 3..4, all columns
print(surveys_df.iloc[:, 5:8].head())          # all rows, columns 5..7 (first five shown)
print(surveys_df.iloc[[2,5,23,1], [8,5,6,7]])  # explicit row and column positions, in the order given
print(surveys_df.iloc[[2,5,23], :])            # listed rows, all columns
print(surveys_df.iloc[[2,5,23], 5:8])          # listed rows, columns 5..7


In [None]:
# Full frame for reference.
print(surveys_df)
# .loc selects by label: rows labelled 2 through 5 (end included) and three named columns.
print(surveys_df.loc[2:5,['species_id', 'plot_id', 'weight']])
# A list of row labels; label 5 is listed twice, so that row appears twice in the result.
print(surveys_df.loc[[2,5,5,7],['species_id', 'plot_id', 'weight']])
# Rows labelled 2 through 5, all columns.
print(surveys_df.loc[2:5,:])




Creating Queries

In [None]:
# Rows recorded after 2001.
after_2001 = surveys_df['year'] > 2001
filtered = surveys_df.loc[after_2001]
print(filtered)

# Rows whose weight is exactly 88.
weighs_88 = surveys_df['weight'] == 88
filtered1 = surveys_df[weighs_88]
print(filtered1)

# First few rows where the month number equals the plot id.
month_equals_plot = surveys_df['month'] == surveys_df['plot_id']
filtered2 = surveys_df[month_equals_plot].head()
print(filtered2)

# Both conditions combined with & (element-wise AND).
filtered3 = surveys_df[weighs_88 & month_equals_plot]
print(filtered3)


Creating a DataFrame from URL


In [None]:
# read_csv accepts a URL directly; this cell needs network access.
url = 'https://raw.githubusercontent.com/datagy/data/main/data.csv'
df = pd.read_csv(url)


In [None]:
# Summary statistics for the columns of df.
df.describe()

In [None]:
# Column names, non-null counts and dtypes; info() prints its report itself.
df.info()

In [None]:
# Ten randomly chosen rows; random_state fixes the selection so the output is reproducible.
df.sample(10, random_state=42)

Sorting data with `sort_values`. It accepts a number of parameters, including:
- `by`: a single column label or a list of column labels to sort by
- `ascending`: whether values are sorted in ascending (`True`) or descending (`False`) order; a list gives one flag per column


In [None]:
# Sort by 'Sales' descending, breaking ties by 'Units' ascending
# (one `ascending` flag per column listed in `by`).
new_df1 = df.sort_values(by=['Sales','Units'], ascending=[False,True])
print(new_df1.head())
# Sort by a single column, largest 'Sales' first.
new_df2 = df.sort_values(by='Sales', ascending=False)
print(new_df2.head())


In [None]:
# Rename columns with an {old: new} mapping; rename() returns a new frame and leaves df unchanged.
df_renamed = df.rename(columns={'Units': 'Items', 'Sales': 'Amount'})
print(df_renamed.head(5))

In [None]:
# Rename individual row labels with an {old: new} mapping.
# NOTE(review): head() shows the first five rows of df; the renamed labels (996, 309, 745, 222)
# will only be visible if they are among them — possibly this was meant for the sorted new_df1. Confirm.
df_renamed_index = df.rename(index={996: 'Item 1', 309: 'Item 2', 745: 'Item 3', 222: 'Item 4'})
print(df_renamed_index.head())

In [None]:
# Set the 'Date' column as the index (set_index returns a new frame; new_df1 is unchanged)
new_df1_indexed = new_df1.set_index('Date')

# Display the DataFrame with the new index, then the index object itself
print("\nDataFrame with 'Date' as index:")
print(new_df1_indexed)
print(new_df1_indexed.index)

In [None]:
# Set the 'Region' and 'Type' columns together as a two-level (multi) index
df_multi_indexed = df.set_index(['Region', 'Type'])

# Display the first rows of the DataFrame with the new index, then the index object itself
print("\nDataFrame with 'Region' and 'Type' as index:")
print(df_multi_indexed.head(5))
print(df_multi_indexed.index)

In [None]:
# reset_index() moves the index levels back into ordinary columns and restores a default integer index.
df_reset = df_multi_indexed.reset_index()
print(df_reset)

astype - change the data type of a column

In [None]:
# One column per basic kind of value: integers, floats, strings and booleans.
data_for_type = dict(
    A=[1, 2, 3],
    B=[1.1, 2.2, 3.3],
    C=['a', 'b', 'c'],
    D=[True, False, True],
)
df_data_for_type = pd.DataFrame(data_for_type)

# Show the dtype pandas inferred for each column.
print(df_data_for_type.dtypes)

In [None]:
# Convert column 'A' to float; astype returns a new Series, which is assigned back to the column
df_data_for_type['A'] = df_data_for_type['A'].astype(float)
print(df_data_for_type.dtypes)
# Convert column 'C' to the pandas 'category' dtype
df_data_for_type['C'] = df_data_for_type['C'].astype('category')
print(df_data_for_type.dtypes)

Handling Missing Data

In [None]:
# The non-null counts in the info() report show which columns contain missing values.
df.info()

In [None]:
# Boolean Series: True where 'Units' is missing.
df['Units'].isna()

In [None]:
# Keep only the rows whose 'Units' value is missing (other columns may still hold data).
missing_rows = df[df['Units'].isna()]

# Display the rows with a missing 'Units' value
print("\nRows with missing 'Units' values:")
print(missing_rows)

In [None]:
# Drop every row that has a missing value in any column (axis=0 drops rows, not columns).
# NOTE(review): despite its name, df_dropped_columns holds the frame with rows removed.
df_dropped_columns = df.dropna(axis=0) 
print(df_dropped_columns)

In [None]:
# Filling missing values in a specific column: missing 'Units' become 0.
# This overwrites the 'Units' column of df, so the missing values are gone from df afterwards.
df['Units'] = df['Units'].fillna(value=0)
print(df)

In [None]:
# Load the data again from the URL, replacing the df whose 'Units' column was filled above.
url = 'https://raw.githubusercontent.com/datagy/data/main/data.csv'
df = pd.read_csv(url)

In [None]:
# Check column dtypes and non-null counts of the freshly loaded frame.
df.info()

In [None]:
# Mean of a single column
print(df['Units'].mean())



In [None]:
# For an entire DataFrame, pass numeric_only=True so that only the numeric columns are
# averaged (this frame also has non-numeric columns such as 'Region' and 'Type').
print(df.mean(numeric_only=True))


In [None]:
# count() returns the number of non-missing values in a column.
print(df['Region'].count())
print(df['Units'].count())

In [None]:
# Apply several aggregation functions to the 'Sales' column in one call
agg_results = df['Sales'].agg(['mean', 'sum', 'max', 'min'])

# Display the aggregated results (one entry per function)
print("\nAggregated results for 'Sales':")
print(agg_results)

In [None]:
# Average 'Sales' per 'Region'; the result is a Series indexed by region.
aggregate = df.groupby('Region')['Sales'].mean()
print(aggregate)


In [None]:
# Group by Region and Type, then calculate the average Sales.
# reset_index() turns the group keys back into ordinary columns.
grouped_multi = df.groupby(['Region', 'Type'])['Sales'].mean().reset_index()

# Display the result
print(grouped_multi)

In [None]:
# Group by Type and aggregate each column with its own function(s):
# three statistics for 'Sales', a single total for 'Units'.
# Because 'Sales' gets a list of functions, the result has two-level column labels.
agg_results = df.groupby('Type').agg({
    'Sales': ['mean', 'max', 'min'],
    'Units': 'sum'
}).reset_index()

# Display the aggregated results
print(agg_results)

In [None]:
# Create a pivot table showing the average Sales for each Region
pivot_table = df.pivot_table(values='Sales', index='Region', aggfunc='mean')

# Display the pivot table
print("\nPivot table showing average Sales by Region:")
print(pivot_table)

In [None]:
# Create a pivot table with multiple aggregation functions:
# maximum, minimum and mean of 'Sales' for each 'Region'
pivot_multiple = pd.pivot_table(df, values='Sales', index='Region', aggfunc=['max', 'min', 'mean'])

# Display the pivot table with multiple aggregation functions
print("\nPivot table with multiple aggregation functions:")
print(pivot_multiple)

In [None]:
# Create a pivot table with margins: totals of 'Sales' and 'Units' per 'Region',
# plus an extra overall-total row added by margins=True
pivot_margins = pd.pivot_table(df, values=['Sales','Units'], 
               index='Region', aggfunc=['sum'], margins=True)

# Display the pivot table with margins
print("\nPivot table with margins:")
print(pivot_margins)

In [None]:
# Create a pivot table of total Sales with one row per Type and one column per Region;
# margins=True adds row/column totals, labelled "Total" via margins_name.
# NOTE(review): the variable name mentions age/city, but the table is Type by Region.
pivot_age_city = df.pivot_table(values='Sales', index='Type', columns='Region', 
                                aggfunc='sum', margins=True, margins_name="Total")

# Display the pivot table
print(pivot_age_city)

Melt: unpivoting a DataFrame from wide format to long format

In [None]:
# A small wide-format frame with four numeric columns, used by the melt examples.
df = pd.DataFrame(dict(
    A=[12, 4, 5, 44, 1],
    B=[5, 2, 54, 3, 2],
    C=[20, 16, 7, 3, 8],
    D=[14, 3, 17, 2, 6],
))
df

In [None]:
# Unpivot the frame: keep 'A' as the identifier column and turn column 'B'
# into (variable, value) rows.
df.melt(id_vars =['A'], value_vars =['B']) 


In [None]:
# Keep 'C' as the identifier and unpivot two columns, 'A' and 'B'.
df.melt(id_vars =['C'], value_vars =['A','B']) 

In [None]:

# Unpivot 'B' and 'C' while keeping 'A' as the identifier.
# var_name and value_name give custom names to the generated "variable" and "value" columns.

df.melt(id_vars =['A'], value_vars =['B', 'C'],  var_name ='Variable_column', value_name ='Value_column') 


Time Series Analysis

In [None]:
# Load the data without any date parsing, so the 'Date' column is not yet a datetime column.
df_with_no_date = pd.read_csv('https://raw.githubusercontent.com/datagy/data/main/data.csv')
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() would emit an extra "None" line.
df_with_no_date.info()


In [None]:
# Convert the 'Date' column to datetime values after loading.
df_with_no_date['Date'] = pd.to_datetime(df_with_no_date['Date'])
# info() prints its report itself and returns None, so it is not wrapped in print().
df_with_no_date.info()

In [None]:
# Alternative: let read_csv parse the 'Date' column while loading.
df_with_date = pd.read_csv('https://raw.githubusercontent.com/datagy/data/main/data.csv', parse_dates=['Date'])
# info() prints its report itself and returns None, so it is not wrapped in print().
df_with_date.info()

# The .dt accessor exposes date components; store year and month as new columns.
df_with_date['Year'] = df_with_date['Date'].dt.year
df_with_date['Month'] = df_with_date['Date'].dt.month
print(df_with_date.head())

# A datetime column can be compared against a date string directly.
print(df_with_date[df_with_date['Date'] > '2020-12-01'].head())


In [None]:
# Three equivalent ways to list the distinct values of the 'Month' column.
print(pd.unique(df_with_date['Month']))   # top-level function, bracket access
print(pd.unique(df_with_date.Month))      # top-level function, attribute access
print(df_with_date['Month'].unique())     # Series method

In [None]:
# Rows that share the same (Date, Type, Region) with at least one other row.
# keep=False marks every member of a duplicate group, not only the repeats.
duplicateRows_all = df_with_date[df_with_date.duplicated(subset=['Date','Type','Region'],keep=False)]
duplicateRows_all


In [None]:
# Keep only the first row for each (Date, Type, Region) combination, then sort by those keys.
# drop_duplicates returns a new frame; df_with_date itself is unchanged.
df_with_date_no_duplicates = df_with_date.drop_duplicates(subset=['Date', 'Type', 'Region'], keep='first').sort_values(by=['Date', 'Type', 'Region'], ascending=True)
# Print the original frame here for comparison.
print("Original DataFrame:")
print(df_with_date)


In [None]:
# Show the de-duplicated, sorted frame (rendered as the cell's output value).
print("\nDataFrame after dropping duplicates:")
df_with_date_no_duplicates


Advanced data handling

In [None]:
# Four (species, color, type) records used by the apply/map examples.
df = pd.DataFrame(
    data=[
        ('carrot', 'red', 1),
        ('papaya', 'yellow', 0),
        ('mango', 'yellow', 0),
        ('apple', 'red', 0),
    ],
    columns=['species', 'color', 'type'],
)
print("Dataframe before Mapping: ")
print(df)

In [None]:
# Define a function to apply
def add_one(x):
    """Return x incremented by one."""
    return x + 1

# Apply the function to each value of the 'type' column and store the result back.
# NOTE: re-running this cell increments 'type' again, because it overwrites its own input.
df['type'] = df['type'].apply(add_one)
print(df)


In [None]:

# Lookup table from species to a category name. Only 'carrot' and 'papaya' are listed,
# so species without an entry ('mango', 'apple') get a missing value in the new column.
mappings = {
    'carrot': 'veg',
    'papaya': 'fruit'
}
 
# map() replaces each 'species' value with its entry in the mapping.
df['type_name'] = df['species'].map(mappings)
print("Dataframe after Mapping: ")
print(df)