# Getting started with Pandas

## 1. Creating data

To use pandas, you'll typically start with the following line of code.

In [1]:
#pip install numpy
#pip install pandas

import pandas as pd
import numpy as np

### 1.1. DataFrame

There are two core objects in pandas: the <font color='coral'><b> DataFrame </b></font> and the <font color='coral'><b> Series </b></font>.

A DataFrame is a table. It contains an array of individual entries, each of which has a certain value. Each entry corresponds to a row (or record) and a column.


#### From a list of lists

In [None]:
# A list of lists: every inner list becomes one row. No `columns` or `index`
# is passed, so pandas labels both axes with integers starting at 0.
data = [
    ['Alice', 25, 'New York'],
    ['Bob', 30, 'Paris'],
    ['Charlie', 35, 'London'],
]
df = pd.DataFrame(data)
df

In [None]:
# Same rows as before, this time with explicit column labels.
column_names = ['name', 'age', 'city']
data = [
    ['Alice', 25, 'New York'],
    ['Bob', 30, 'Paris'],
    ['Charlie', 35, 'London'],
]

df = pd.DataFrame(data, columns=column_names)
df

In [None]:
# Rows, column labels and row labels are all given explicitly.
data = [
    ['Alice', 25, 'New York'],
    ['Bob', 30, 'Paris'],
    ['Charlie', 35, 'London'],
]

df = pd.DataFrame(
    data,
    index=['Student1', 'Student2', 'Student3'],
    columns=['name', 'age', 'city'],
)
df

#### From a dictionary of lists

In [None]:
# A dictionary of lists: each key becomes a column label and its list
# supplies that column's values.
data = dict(
    name=['Alice', 'Bob', 'Charlie'],
    age=[25, 30, 35],
    city=['New York', 'Paris', 'London'],
)

df = pd.DataFrame(data)
df

In [None]:
# Dictionary of columns plus explicit row labels.
row_labels = ['one', 'two', 'three']
data = {
    'name': ['Alice', 'Bob', 'Charlie'],
    'age': [25, 30, 35],
    'city': ['New York', 'Paris', 'London'],
}

df = pd.DataFrame(data, index=row_labels)
df

### 1.2. Series

* A Series, by contrast, is a sequence of data values. If a DataFrame is a table, a Series is a column.
* The Series and the DataFrame are intimately related. It's helpful to think of a DataFrame as actually being just a bunch of Series <font color='coral'><b> glued together </b></font>.

In [None]:
# A Series is a single labelled column of values; `name` is the label it
# would carry as a column inside a DataFrame.
pd.Series([1, 2, 3, 4, 5] , name='Test1')

In [None]:
# Two named Series placed side by side: each Series becomes one column of the
# resulting DataFrame and its name becomes the column label.
one = pd.Series(data=[1, 2, 3, 4, 5], name='Test1')
two = pd.Series(data=[6, 7, 8, 9, 10], name='Test2')
test_table = pd.concat([one, two], axis='columns')
test_table

## 2. Reading data files

In [None]:
# Reading .xlsx files needs the optional "openpyxl" package:
# pip install "openpyxl"

# Load the workbook (path is relative to the notebook's working directory)
# into a DataFrame and display it.
my_df = pd.read_excel('Monthly_Sales.xlsx')
my_df

In [None]:
# Load the CSV file into a DataFrame; `my_df` now refers to this frame
# instead of the Excel data read above.
my_df = pd.read_csv('salesmonthly.csv')
my_df

In [None]:
# pd.read_sql()
# pd.read_sql_query()
# pd.read_sql_table()

# pd.read_sas()
# pd.read_spss()
# pd.read_stata()

# pd.read_pickle()
# pd.read_json()

## 3. Useful attributes and methods

In [None]:
# `shape` is a (rows, columns) tuple, so it can be unpacked directly.
n_rows, n_columns = my_df.shape
print(my_df.shape)
print('number of df rows:', n_rows)
print('number of df columns:', n_columns)

In [None]:
# `shape` is an attribute, not a method: it is a plain tuple, so calling it
# as `my_df.shape()` would fail with "'tuple' object is not callable".
print(type(my_df.shape))

In [None]:
# First three rows of the frame.
my_df.head(n=3)

In [None]:
# Last three rows of the frame.
my_df.tail(n=3)

In [None]:
# Three rows picked at random; no seed is given, so every run returns a
# different selection.
my_df.sample(n=3)

In [None]:
# Same call again: without `random_state` the rows generally differ from the
# previous cell's output.
my_df.sample(n=3)

In [None]:
# Fixing `random_state` makes the random selection reproducible across runs.
my_df.sample(n=4, random_state=1)

In [None]:
# `frac` samples a fraction of the rows (here 3%) instead of a fixed count.
my_df.sample(frac=0.03, random_state=42)

In [None]:
# Prints a summary of the frame: index range, column names, non-null counts,
# dtypes and memory usage.
my_df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max); by default only
# numeric columns are summarised.
my_df.describe()

In [None]:
# include='all' also summarises non-numeric columns (unique, top, freq);
# statistics that do not apply to a column are shown as NaN.
my_df.describe(include = 'all')

In [None]:
# Request specific percentiles (values between 0 and 1) in place of the
# default quartiles.
my_df.describe(percentiles=[0.125,0.17,0.05,0.6])

* Copy method

In [None]:
# copy() builds an independent DataFrame: later changes to my_df_1 do not
# affect my_df.
my_df_1 = my_df.copy()
my_df_1.head(3)

In [None]:
# Plain assignment does NOT copy: my_df_2 is just a second name for the very
# same DataFrame object as my_df.
my_df_2 = my_df
my_df_2.head(3)

In [None]:
# Rename a column on the copy, modifying my_df_1 in place.
my_df_1.rename(columns = {'date': 'invoice_date'}, inplace=True)
my_df_1.head(3)

In [None]:
# The original still has its 'date' column: renaming the copy left it alone.
my_df.head(3)

In [None]:
# Rename through the alias; my_df_2 and my_df are the same object, so this
# in-place change is visible through both names.
my_df_2.rename(columns = {'date': 'invoice_date'}, inplace=True)
my_df_2.head(3)

In [None]:
# my_df now shows 'invoice_date' too, because the alias modified the shared
# object.
my_df.head(3)

## 4. Accessing DataFrame

In [None]:
# Re-read the CSV so this section starts from a fresh frame with the file's
# original column names.
my_df = pd.read_csv('salesmonthly.csv')
my_df

In [None]:
# df['column_name'] with a single label returns that column as a pd.Series.
my_df['product1']

In [None]:
# Confirm that single-label selection yields a Series.
type(my_df['product1'])

In [None]:
# df.column_name (attribute access) returns the same pd.Series as
# df['column_name'].
my_df.product1

In [None]:
# Attribute access only works when the column title is a valid Python
# identifier. A title containing a space (non-contiguous title) cannot be
# written after the dot: `my_df.product 8` is not even valid syntax.
# The statement is compiled from a string so the SyntaxError can be shown
# without stopping "Restart & Run All".
try:
    compile("my_df.product 8", "<cell>", "eval")
    dot_access_error = None
except SyntaxError as err:
    dot_access_error = f"{type(err).__name__}: {err.msg}"

# Bracket access (next cell) works for any title, including multi-word or
# non-Latin ones.
dot_access_error

In [None]:
# Bracket access takes the label as a string, so a title containing a space
# is not a problem.
my_df['product 8']

In [None]:
# Passing a *list* of labels (double brackets) returns a pd.DataFrame, even
# when the list holds a single column.
my_df[['product1']]

In [None]:
# Confirm that list-based selection yields a DataFrame.
type(my_df[['product1']])

In [None]:
# Common mistake: without the inner list brackets, Python passes the tuple
# ('product1', 'product2') as ONE column label, which does not exist, so
# pandas raises KeyError. The error is caught and printed so the notebook
# still runs from top to bottom; the next cell shows the correct form.
try:
    my_df['product1','product2']
except KeyError as err:
    print(f"{type(err).__name__}: {err}")

In [None]:
# Correct way to select several columns: pass a list of labels.
my_df[['product1','product2']]

In [None]:
# Summary statistics for a single column (a Series).
my_df.product1.describe()

In [None]:
# Largest value in the column and the integer *position* (not the index
# label) of the row that holds it, displayed together as a tuple.
my_df.product1.max() , my_df.product1.argmax()

In [None]:
# Array of the distinct values in the column, in order of first appearance.
my_df.Customer.unique()  # SELECT DISTINCT Customer FROM my_df

In [None]:
# Number of distinct values in the column (missing values are not counted by
# default).
my_df.Customer.nunique() # SELECT COUNT(DISTINCT Customer) FROM my_df

In [None]:
# How many rows each customer has, sorted from most to least frequent.
# SQL analogue: SELECT Customer, COUNT(*) FROM my_df GROUP BY Customer
my_df.Customer.value_counts()

In [None]:
# normalize=True reports each customer's share of the rows (fractions that
# sum to 1) instead of raw counts.
my_df.Customer.value_counts(normalize=True)

In [None]:
# Add a new column by assigning to a label that does not exist yet.
# The values are product1 standardised: centred on its mean and scaled by its
# standard deviation.
product1 = my_df.product1
centered = product1 - product1.mean()
my_df['new_column'] = centered / product1.std()
my_df.head(3)

## 5. Indexing in Pandas

The indexing operator and attribute selection work just like the rest of the Python ecosystem.  
However, pandas has its own accessor operators, <font color='coral'><b> loc </b></font> and <font color='coral'><b> iloc </b></font> for more advanced operations.


### 5.1. Index-based selection

Index-based selection means selecting data based on its numerical position in the data. <font color='coral'><b> iloc </b></font> follows this paradigm.  


In [None]:
# Re-read the CSV so the frame no longer contains the 'new_column' added in
# the previous section.
my_df = pd.read_csv('salesmonthly.csv')
my_df.head()

In [None]:
# A single integer selects the row at that position (the second row) and
# returns it as a Series.
my_df.iloc[1]

In [None]:
# A list of positions returns a DataFrame, even for a single row.
my_df.iloc[[1]]

In [None]:
# The trailing comma makes the key the one-element tuple (1,): row position 1
# with no column restriction, i.e. the same Series as my_df.iloc[1].
my_df.iloc[1,]

In [None]:
# Positional slices exclude the end point: rows at positions 1-2 and columns
# at positions 0-5.
my_df.iloc[1:3,0:6]

In [None]:
# Lists pick arbitrary positions: rows 1, 4 and 12 with columns 1, 3 and 7.
my_df.iloc[[1,4,12],[1,3,7]]

In [None]:
# argmax() gives the integer position of the largest product1 value, which is
# exactly what iloc expects; wrapping it in a list returns the whole row as a
# one-row DataFrame.
top_value = my_df.product1.max()
top_position = my_df.product1.argmax()
print(top_value)
print(top_position)

my_df.iloc[[top_position]]

In [None]:
# Negative positions count from the end: the last five rows.
my_df.iloc[-5:]

In [None]:
# Works without iloc because plain [] treats a *slice* as a row selection
# (a single label or a list of labels selects columns instead). With an
# integer slice the rows are taken by position, so this matches
# my_df.iloc[-5:].
my_df[-5:]

In [None]:
# Plain [] accepts only ONE selector, so it cannot slice rows and columns
# together: the key (slice, slice) is not a valid column label and pandas
# raises. The exception class differs between pandas versions (TypeError or
# InvalidIndexError), hence the broad `except`, used here only to display the
# error without stopping "Restart & Run All". The next cell shows the
# working form with iloc.
try:
    my_df[:,1:7]
except Exception as err:
    print(f"{type(err).__name__}: {err}")

In [None]:
# iloc accepts a selector per axis: all rows, columns at positions 1-6.
my_df.iloc[:,1:7]

### 5.2. Label-based selection

The second paradigm for attribute selection, followed by the <font color='coral'><b> loc </b></font> operator, is label-based selection. In this paradigm, it's the data index value, not its position, which matters.

In [None]:
# First five rows, to see the index labels that loc will use.
my_df.head()

In [None]:
# A label stored as text is not a number: Python does not convert '3' to 3,
# so subtracting from it raises TypeError. The error is captured and
# displayed so the notebook keeps running under "Restart & Run All".
# NOTE(review): presumably shown to stress that labels are values rather than
# positions you can do arithmetic on; confirm the intended lesson.
try:
    label_arithmetic = '3' - 1
except TypeError as err:
    label_arithmetic = f"{type(err).__name__}: {err}"
label_arithmetic

In [None]:
# Label slices include the end point: rows labelled 0 through 3 (four rows
# with the default integer index) of column 'product1', returned as a Series.
my_df.loc[0:3,'product1']

In [None]:
# Same rows, but a list of column labels returns a DataFrame.
my_df.loc[0:3,['product1']]

In [None]:
# Positional counterpart: rows at positions 0-2 only (end point excluded) and
# the column at position 1. Compare the row count with the loc result above.
# NOTE(review): presumably position 1 is 'product1'; verify against the file.
my_df.iloc[0:3,[1]]

In [None]:
# Use the 'date' column as the row index while reading, so rows can be
# selected by date label. The trailing comment gives the positional
# alternative, index_col=0 (the first column of the file).
my_df_2 = pd.read_csv('salesmonthly.csv', index_col='date') #index_col = 0
my_df_2

In [None]:
# Lists of labels on both axes: exactly these two dates and these two
# columns. The index labels are the date strings as written in the file.
my_df_2.loc[['1/31/2014','3/31/2014'],['product1','product5']]

In [None]:
# A label slice on the rows: every row from '1/31/2014' up to AND including
# '3/31/2014', in the order the rows appear in the frame.
my_df_2.loc['1/31/2014':'3/31/2014',['product1','product5']]

In [None]:
# Positional selection on the date-indexed frame: rows at positions 0-2 and
# columns at positions 1-5.
# Note this is NOT the same selection as the label-based call
#   my_df_2.loc['1/31/2014':'3/31/2014',['product1','product5']]
# which returns only the two named columns.
my_df_2.iloc[0:3, 1:6]

### 5.3 Choosing between loc and iloc

<font color='coral'><b> iloc </b></font> uses the Python stdlib indexing scheme, where the **first element of the range is included and the last one excluded.** So 0:10 will select entries 0,...,9. (10 entries)  
<font color='coral'><b> loc </b></font>, meanwhile, **indexes inclusively.** So 0:10 will select entries 0,...,10. (10+1 entries)

In [None]:
# Label slices on both axes; both end points ('5/31/2014' and 'product4')
# are included in the result.
my_df_2.loc['1/31/2014':'5/31/2014','product1':'product4']

### 5.4 Manipulating the index

In [None]:
# Fresh copy of the data with the default integer index.
my_df = pd.read_csv('salesmonthly.csv')
my_df.head(4)

In [None]:
# set_index returns a NEW frame indexed by 'Customer'. The result is only
# displayed here, not assigned, so my_df itself is unchanged.
my_df.set_index('Customer')

In [None]:
# my_df still has its integer index and its 'Customer' column.
my_df

In [None]:
# 1st way to set the index: keep the returned frame under a new name,
# leaving my_df untouched.
edited_table = my_df.set_index('Customer')
edited_table.head(3)

In [None]:
# The original frame is unaffected by the assignment above.
my_df.head(3)

In [None]:
# reset_index moves the 'Customer' index back into an ordinary column and
# restores the default integer index.
edited_table = edited_table.reset_index()
edited_table

In [89]:
# 2nd way to set the index: inplace=True modifies my_df itself and returns
# None, so nothing is displayed by this cell.
my_df.set_index('Customer', inplace=True)

In [None]:
# my_df is now indexed by 'Customer'.
my_df.head()

In [None]:
# Undo the in-place set_index: 'Customer' becomes a regular column again and
# the integer index is restored.
my_df.reset_index(inplace=True)
my_df.head()

In [None]:
# Second reset_index in a row: the frame already has the default integer
# index, so that index is moved into a new column named 'index'.
# This cell is not idempotent: each re-run inserts another column.
my_df.reset_index(inplace=True)
my_df

In [None]:
# If reset_index is applied twice, an extra column is added, because the
# old integer index is inserted as a column.
# Uncomment the line below to see the extra column:

# my_df.reset_index(inplace=True)

# drop=True discards the old index instead of inserting it as a column, so
# no extra column is created by this call.
my_df.reset_index(inplace=True , drop = True)
my_df.head()

In [None]:
# The correct way to deal with set and reset index is as follows.
# Start again from a freshly loaded frame.

my_df = pd.read_csv('salesmonthly.csv')
my_df.head(3)

In [None]:
# Make 'Customer' the index of my_df itself.
my_df.set_index('Customer', inplace=True)
my_df.head()

In [None]:
# Restore the default integer index.
# NOTE(review): drop=True throws the current index away rather than turning
# it back into a column, so after set_index('Customer') the Customer values
# are no longer in my_df. Omit drop=True if the column should be kept.
my_df.reset_index(drop = True,inplace = True)
my_df.head()

In [None]:
# Reload the data and index it by 'Customer' for label-based lookups.
my_df = pd.read_csv('salesmonthly.csv')
my_df.set_index('Customer', inplace=True)
my_df

In [None]:
# Every row whose index label is 'mammad', restricted to column 'product1';
# lists on both axes return a DataFrame.
# Assumes 'mammad' occurs in the Customer column of the file; a missing
# label raises KeyError.
my_df.loc[['mammad'],['product1']]