# <div style="text-align: center;">The Basics of PANDAS</div>
## <div style="text-align: center;">My GitHub: [https://github.com/ilyassan](https://github.com/ilyassan)</div>

# Get Started With PANDAS

In [None]:
import re

import pandas as pd

# Load a comma-separated (CSV) file into a DataFrame
df = pd.read_csv("pokemon_data.csv")

# Load a tab-separated TXT file; `sep` sets the field separator
df = pd.read_csv("pokemon_data.txt", sep='\t')

# Load an Excel workbook
df = pd.read_excel("pokemon_data.xlsx")

# NOTE: the three loads above are alternatives. Each assignment replaces `df`,
# so after this cell runs `df` holds what was read from the Excel file.

# File formats listed as supported by pandas:
# CSV, XLSX, XLS, TXT, JSON, HTML, HDF5, Feather, Parquet, ORC, Stata, SAS, SPSS, Pickle, SQL, Clipboard, XML.

## Describing Data

In [None]:
# NOTE: when these statements run together as one notebook cell, only the value
# of the last expression is rendered; run them one at a time to see each result.

# The table itself
df

# Summary statistics of the table (count, mean, standard deviation, min, max, percentiles)
df.describe()

# Column labels of the table
df.columns

# Number of rows and columns, as (rows, columns)
df.shape


## Getting Data

In [None]:
# First n rows (n = 5 here)
df.head(5)

# Last n rows (n = 5 here)
df.tail(5)

# Select specific columns by name
df[["Name", "HP"]]

# One row by integer position, counted from 0 (iloc ==> integer location)
df.iloc[799]

# A range of rows by position: iloc[start:stop], with stop excluded
df.iloc[1:2]

# One cell by position, counted from 0: iloc[row position, column position]
df.iloc[1, 2]

# Visit the rows one at a time
for i, row in df.iterrows():
    print(i, row["Name"])

# Rows that satisfy a condition: df.loc[condition]
is_rock = df["Type 1"] == "Rock"
df.loc[is_rock]

# Specific columns of the rows that satisfy a condition: df.loc[condition, [columns]]
df.loc[is_rock, ["Name", "Type 1"]]


## Sorting Data

In [None]:
# Sort by one column: sort_values(by=column, ascending=True or False)
# Ascending order is the default.
df.sort_values(by="Type 1", ascending=False)

# Sort by several columns, giving one direction per column:
# sort_values(by=[column1, column2, ...], ascending=[column1 order, column2 order, ...])
df.sort_values(by=["Type 1", "HP"], ascending=[True, False])
# OR, equivalently, with 1 => True and 0 => False
df.sort_values(by=["Type 1", "HP"], ascending=[1, 0])


## Group By

In [None]:
### Examples

# Count the rows in each "Type 1" group, largest group first
rows_per_type = df.groupby("Type 1").size()
rows_per_type.sort_values(ascending=False)

# Same count, grouping on two columns at once
rows_per_type_pair = df.groupby(["Type 1", "Type 2"]).size()
rows_per_type_pair.sort_values(ascending=False)

# Mean of the "Attack" column per "Type 1" group, highest mean first
(
    df.groupby("Type 1")[["Attack"]]
    .mean()
    .sort_values(by="Attack", ascending=False)
)

# Common aggregation functions to use after groupby():
# mean(), median(), std(), sum(), count(), size(), min(), max()

## Change Data

In [None]:
# A DataFrame refuses to insert() a column name that already exists, and drop()
# fails on a name that is not there. The alternatives ("OR") below are therefore
# interleaved so that each one starts from a valid state and the whole cell runs
# from top to bottom.

# Add a new column: df['Column Name'] = value
df['Total'] = "Anything"

# Remove a column (returns a new DataFrame)
df = df.drop(columns=['Total'])

# OR add the column at a chosen position:
# insert( index where to insert, 'Column Name', value or [ values ] )
df.insert(4, "Total", ["Anything"] * len(df))

# OR remove the column in place
df.drop(columns=['Total'], inplace=True)

# Sum columns into a new column
df['Total'] = df['HP'] + df['Attack'] + df['Defense'] + df['Sp. Atk'] + df['Sp. Def'] + df['Speed']
# OR select the same six columns by position. "Total" sits at the end here,
# so positions 4..9 are still HP .. Speed.
df['Total'] = df.iloc[:, 4:10].sum(axis=1)
# OR compute the sum and insert it directly at position 10.
# The existing "Total" has to be dropped first, otherwise insert() raises ValueError.
df = df.drop(columns=['Total'])
df.insert(10, "Total", df.iloc[:, 4:10].sum(axis=1))

# Change column position: list the columns in the wanted order
df = df[[ "#", "Name", "Type 1", "Type 2", "HP", "Attack", "Defense", "Sp. Atk", "Sp. Def", "Speed", "Total", "Generation", "Legendary" ]]
# OR build the order from the current columns: take "Total" out and put it back
# right after "Speed". Working by name keeps this correct wherever "Total" is now.
cols = [c for c in df.columns if c != "Total"]
after_speed = cols.index("Speed") + 1
df = df[ cols[:after_speed] + ["Total"] + cols[after_speed:] ]


df.head()

## Filtering Data

In [None]:
# Filter rows with multiple conditions: df.loc[condition1 & condition2]
#   and => &
#   or  => |
# Wrap every comparison in parentheses: & and | bind tighter than == and >=.
df.loc[ ( (df["Type 1"] == "Grass") | (df["Type 2"] == "Poison") ) & (df["HP"] >= 100) ]

### Filter rows that contain a string

# na=False treats a missing value as "no match". Without it, a missing name
# yields NaN in the mask, and the mask can then neither be negated with ~
# nor used to index with .loc.

# Rows whose name contains "Mega"
df.loc[ df["Name"].str.contains("Mega", na=False) ]
# Rows whose name does not contain "Mega"
df.loc[ ~df["Name"].str.contains("Mega", na=False) ]

# Filter using a regex pattern (here: names starting with "pi", ignoring case).
# `re` is imported with the other imports at the top of the notebook.
df.loc[ df["Name"].str.contains("^pi[a-z]*", flags=re.I, regex=True, na=False) ]


## Conditional Changes

In [None]:
# Conditional assignment: where the condition holds, overwrite the chosen column(s).

# Rows whose "Type 1" is "Normal" get "Regular" instead
is_normal = df["Type 1"] == "Normal"
df.loc[is_normal, "Type 1"] = "Regular"

# Rows whose "Type 1" is "Fire" get two columns set at once, one value per column
is_fire = df["Type 1"] == "Fire"
df.loc[is_fire, ["Generation", "Legendary"]] = [2, True]

df

## Dealing With Big Data

In [189]:
# A very large file may not fit in memory when it is read in one go.
# Passing `chunksize` to read_csv hands the file over piece by piece
# (100 rows per piece here), so each piece can be processed on its own.
# Example of chunking: add up the "HP" column over all pieces.

hp_chunks = pd.read_csv('pokemon_data.csv', chunksize=100)
total_sum = sum(chunk['HP'].sum() for chunk in hp_chunks)

print("Total sum:", total_sum)


Total sum: 55407


## Saving Data

In [None]:
# Save as a comma-separated CSV file
df.to_csv("modified.csv", index=False)

# Save as a tab-separated TXT file (`sep` sets the field separator)
df.to_csv("modified.txt", sep='\t', index=False)

# Save as an Excel workbook
df.to_excel("modified.xlsx", index=False)

# index=False  --> do not write the row index that pandas adds as an extra first column

# <p style="text-align: center;" >I Hope You Benefit From This.</p>