In [None]:
# Using the .describe() method in pandas
# Calling .describe() on your dataset will produce a series of descriptive statistics that allow you to get to know your data better.
# # link to the .ipynb file 'https://github.com/Data-Indepedent/pandas_everything/blob/master/pandas_functions/Pandas_Describe.ipynb'

# load pandas
import pandas as pd

In [None]:
# Load the dataset: a street-tree list stored as a CSV file on GitHub.
url = 'https://raw.githubusercontent.com/Data-Indepedent/pandas_everything/refs/heads/master/data/Street_Tree_List.csv'

# In pandas, "parsing" means interpreting raw data from a source (CSV, Excel, JSON, plain strings, ...)
# and converting it into a structured format pandas can work with, primarily DataFrames and Series.
# Here `parse_dates` asks read_csv to turn the 'PlantDate' column into datetimes while loading.
#
# The frame is built in a single chain that never mutates an existing object (no `inplace=True`),
# so re-running this cell always rebuilds `df` from the raw file in one step.
df = (
    pd.read_csv(url, parse_dates=['PlantDate'])
    # keep only a subset of the columns, in this order
    .loc[:, ['TreeID', 'qSpecies', 'PlantDate', 'DBH']]
    # rename a column; `columns=` is the explicit spelling of `mapper=..., axis=1`
    .rename(columns={'DBH': 'tree_depth'})
)

# Preview the first 10 rows.
df.head(10)

In [None]:
# Default describe
# By default, .describe() returns a series of descriptive statistics about the DataFrame.
# NOTE(review): which columns are summarised by default depends on the pandas version
# (numeric columns only on 1.x; datetime columns as well on >= 2.0) - confirm against the installed version.
df.describe()

In [None]:
# Including all columns via 'include'
# If you want every column (not only the default selection) to appear in the summary, set include='all'.
df.describe(include='all')

In [None]:
# Treating datetimes like numbers via datetime_is_numeric=True
# Here .describe() is called on a single column (a Series) instead of on the whole DataFrame.
# This cell is the baseline: it summarises "PlantDate" with no extra arguments, so its output can be
# compared with the next cell, which passes datetime_is_numeric=True (dates treated like objects vs. like numbers).
# NOTE(review): on pandas >= 2.0 datetimes are always summarised numerically, so this baseline should
# already show the numeric-style summary - confirm against the installed version.
df['PlantDate'].describe()


In [None]:
# 'datetime_is_numeric' tells .describe() to treat our dates like numbers
# (mean, min, quartiles, max) instead of like objects (unique, top, freq).
#
# The keyword only exists in pandas 1.1-1.5. pandas 2.0 removed it because datetimes are
# now always summarised numerically, and passing it there raises
# "TypeError: ... unexpected keyword argument 'datetime_is_numeric'".
# Fall back to the plain call in that case so this cell runs on either pandas line.
try:
    plant_date_summary = df['PlantDate'].describe(datetime_is_numeric=True)
except TypeError:
    # pandas >= 2.0: the keyword is gone and the default is already the numeric summary
    plant_date_summary = df['PlantDate'].describe()

plant_date_summary

In [None]:
# The .describe() method gives you a quick statistical summary of all numeric columns in your DataFrame.

# This will show:
# # count: How many non-null values there are
# # mean: The average value
# # std: The standard deviation (you can ignore this for now!)
# # min and max: The smallest and largest values
# # 25%, 50%, 75%: The 25th, 50th (median), and 75th percentiles

# It’s a fast way to check for things like:
# # Missing or unusual values (e.g., a minimum age of 0)
# # Ranges that are wider or narrower than you expected
# # Columns where the average doesn’t match your assumptions


In [None]:
# .value_counts() is made for categories — like gender, class, port of embarkation, or anything stored as text
# This shows:
# # The unique values in that column
# # How many times each one appears
# # By default, it sorts from most frequent to least frequent, but you can change that if you want: df['Embarked'].value_counts(ascending=True)

# This is super useful for:
# # Checking class imbalance (e.g., way more men than women)
# # Finding typos (e.g., "Yes" vs "yes" vs "y")
# # Seeing how evenly a feature is distributed

In [None]:
# You can also directly get specific stats on a single column:
# Example:
# # df['Age'].mean()       # Average age
# # df['Age'].median()     # Middle value
# # df['Age'].min()        # Youngest passenger
# # df['Age'].max()        # Oldest passenger
# # df['Age'].nunique()    # Number of unique ages
# # df['Embarked'].nunique()  # Number of unique embarkation ports

# These functions — like .mean(), .max(), and .nunique() — are called aggregation functions because they summarize or condense data. 
# You’ll see them pop up again soon when we use .groupby() to calculate these stats across categories. For now, just keep in mind that these are part of your core data analysis toolkit.

In [None]:
# How to verify which pandas version is installed, and which Python executable / site-packages the kernel is using

# import pandas as pd
# print("Pandas version:", pd.__version__)
# help(pd.Series.describe)

# import pandas as pd
# print("Pandas version:", pd.__version__)

# import sys
# print("Python executable:", sys.executable)

# import site
# print("Site-packages:", site.getsitepackages())

# import pandas as pd
# print(pd.__version__)