## Pandas
- Powerful data manipulation library used for data cleaning and analysis
- Provides two data structures: Series and DataFrame
- Series: 1D array-like object
- DataFrame: 2D, size-mutable and potentially heterogeneous tabular data structure with labeled axes

In [3]:
import pandas as pd

In [None]:
# Build a Series from a plain Python list; no index is supplied here,
# so pandas labels the values with the default integer index 0..n-1.
data = [1, 2, 3, 4, 5, 6]
series = pd.Series(data=data)
print(series)

0    1
1    2
2    3
3    4
4    5
5    6
dtype: int64


In [5]:
# Build a Series from a dictionary: the keys become the index labels
# and the values become the data.
data = dict(a=1, b=2, c=3)
dict_series = pd.Series(data=data)
print(dict_series)

a    1
b    2
c    3
dtype: int64


In [6]:
# Build a Series from a list and attach explicit string labels as its index
# (one label per value).
data = [1, 2, 3, 4, 5]
series2 = pd.Series(data, index=list('abcde'))

print(series2)

a    1
b    2
c    3
d    4
e    5
dtype: int64


In [7]:
# Build a DataFrame from a dictionary of equal-length lists:
# each key becomes a column name and each list supplies that column's values.
data = dict(
    name=['abhishek', 'kunal', 'rajat', 'mohit'],
    age=[22, 21, 23, 20],
    city=['Delhi', 'Palwal', 'Chandigarh', 'Faridabad'],
)

df = pd.DataFrame(data=data)
df

Unnamed: 0,name,age,city
0,abhishek,22,Delhi
1,kunal,21,Palwal
2,rajat,23,Chandigarh
3,mohit,20,Faridabad


In [8]:
import numpy as np

In [12]:
# Convert the DataFrame to a NumPy array with DataFrame.to_numpy(), the
# conversion method recommended by pandas. The columns mix strings and
# integers, so the result is a 2-D array of Python objects.
data_as_Array = df.to_numpy()
print(data_as_Array.shape)
# sep='' stops print() from inserting its default space after the newline,
# which would otherwise shift the first row of the array one column right.
print('data as array: \n', data_as_Array, sep='')

(4, 3)
data as array: 
 [['abhishek' 22 'Delhi']
 ['kunal' 21 'Palwal']
 ['rajat' 23 'Chandigarh']
 ['mohit' 20 'Faridabad']]


In [14]:
# Build a DataFrame from a list of dictionaries:
# every dictionary is one row, and its keys name the columns.
data = [
    dict(name='abhishek', age=22, city='delhi'),
    dict(name='Vivek', age=21, city='banglore'),
    dict(name='Jeff', age=30, city='Chicago'),
    dict(name='John', age=32, city='Los Angeles'),
]

df = pd.DataFrame(data=data)
df

Unnamed: 0,name,age,city
0,abhishek,22,delhi
1,Vivek,21,banglore
2,Jeff,30,Chicago
3,John,32,Los Angeles


In [18]:
# Load the sales CSV into a DataFrame and preview the first rows
# (head() returns 5 rows when called without an argument).
sales_df = pd.read_csv(filepath_or_buffer='sales_data.csv')
sales_df.head()

Unnamed: 0,Product_ID,Sale_Date,Sales_Rep,Region,Sales_Amount,Quantity_Sold,Product_Category,Unit_Cost,Unit_Price,Customer_Type,Discount,Payment_Method,Sales_Channel,Region_and_Sales_Rep
0,1052,2023-02-03,Bob,North,5053.97,18,Furniture,152.75,267.22,Returning,0.09,Cash,Online,North-Bob
1,1093,2023-04-21,Bob,West,4384.02,17,Furniture,3816.39,4209.44,Returning,0.11,Cash,Retail,West-Bob
2,1015,2023-09-21,David,South,4631.23,30,Food,261.56,371.4,Returning,0.2,Bank Transfer,Retail,South-David
3,1072,2023-08-24,Bob,South,2167.94,39,Clothing,4330.03,4467.75,New,0.02,Credit Card,Retail,South-Bob
4,1061,2023-03-24,Charlie,East,3750.2,13,Electronics,637.37,692.71,New,0.08,Credit Card,Online,East-Charlie


## Accessing data from the dataframes

In [19]:
# accessing data from dataframe
# head() with no argument returns at most the first 5 rows
df.head()

Unnamed: 0,name,age,city
0,abhishek,22,delhi
1,Vivek,21,banglore
2,Jeff,30,Chicago
3,John,32,Los Angeles


In [17]:
# Selecting a single column with [] returns it as a pandas Series
all_names = df['name']
print(all_names)
print(type(all_names))

0    abhishek
1       Vivek
2        Jeff
3        John
Name: name, dtype: object
<class 'pandas.core.series.Series'>


In [20]:
# Accessing data with loc and iloc:
#   - loc selects by label (index / column names)
#   - iloc selects by integer position
# Custom string labels are used as the index here, so labels and positions differ.

data = dict(
    name=['abhishek', 'kunal', 'rajat', 'mohit'],
    age=[22, 21, 23, 20],
    city=['Delhi', 'Palwal', 'Chandigarh', 'Faridabad'],
)

df = pd.DataFrame(data, index=list('abcd'))
df

Unnamed: 0,name,age,city
a,abhishek,22,Delhi
b,kunal,21,Palwal
c,rajat,23,Chandigarh
d,mohit,20,Faridabad


### Using LOC

In [24]:
# accessing single row using loc
# a single label returns that row as a Series indexed by the column names
df.loc['c']

name         rajat
age             23
city    Chandigarh
Name: c, dtype: object

In [25]:
# accessing multiple rows using loc
# a list of labels returns a DataFrame containing just those rows
df.loc[['b', 'c']]

Unnamed: 0,name,age,city
b,kunal,21,Palwal
c,rajat,23,Chandigarh


In [35]:
# single value: row label 'b', column label 'age' -> scalar
print(df.loc['b', 'age'])


# sub-table: rows 'b' and 'c', columns 'age' and 'name'
# (columns come back in the order they were requested)
print(df.loc[['b', 'c'],['age', 'name']])

21
   age   name
b   21  kunal
c   23  rajat


### Using iLOC
- Position (integer index) based
- Selects elements using their position rather than their label

In [36]:
# show the current frame for reference before the position-based examples
df

Unnamed: 0,name,age,city
a,abhishek,22,Delhi
b,kunal,21,Palwal
c,rajat,23,Chandigarh
d,mohit,20,Faridabad


In [37]:
# first row by integer position (position 0), returned as a Series
df.iloc[0]

name    abhishek
age           22
city       Delhi
Name: a, dtype: object

In [38]:
# slicing using iloc
# rows at positions 1 and 2 are returned; the stop position (3) is excluded
df.iloc[1:3]

Unnamed: 0,name,age,city
b,kunal,21,Palwal
c,rajat,23,Chandigarh


In [None]:
# accessing a specific cell inside the dataframe
# NOTE: list indexers ([1], [0]) keep the result a 1x1 DataFrame;
# use df.iloc[1, 0] to get the scalar value itself
df.iloc[[1],[0]]

Unnamed: 0,name
b,kunal


In [46]:
# row at position 1, columns at positions 0 and 1 -> a 1x2 DataFrame
df.iloc[[1],[0,1]]

Unnamed: 0,name,age
b,kunal,21


In [48]:
# Accessing a single value using at and iat
# at is label based, iat is integer-position based; both return the scalar

print(df.at['a', 'name'])
print(df.iat[0,0])

abhishek
abhishek


In [49]:
# data manipulation
# adding salary column: the list supplies one value per row, assigned in row order

df['salary'] = [50000, 60000, 70000, 80000]
df

Unnamed: 0,name,age,city,salary
a,abhishek,22,Delhi,50000
b,kunal,21,Palwal,60000
c,rajat,23,Chandigarh,70000
d,mohit,20,Faridabad,80000


In [50]:
# drop() returns a new DataFrame without the 'age' column;
# the result is only displayed here, so df itself is left untouched
df.drop(columns='age')

Unnamed: 0,name,city,salary
a,abhishek,Delhi,50000
b,kunal,Palwal,60000
c,rajat,Chandigarh,70000
d,mohit,Faridabad,80000


In [51]:
# df still has the 'age' column: the drop() result above was not assigned to anything
df

Unnamed: 0,name,age,city,salary
a,abhishek,22,Delhi,50000
b,kunal,21,Palwal,60000
c,rajat,23,Chandigarh,70000
d,mohit,20,Faridabad,80000


**Column `age` is still present because `drop` returns a new DataFrame; the result was not saved, so `df` itself is unchanged.**

In [52]:
# method 1: keep the result of drop() by assigning it to a variable
# (either back to df or, as here, to a new name)

df_without_age = df.drop(columns='age')
df_without_age

Unnamed: 0,name,city,salary
a,abhishek,Delhi,50000
b,kunal,Palwal,60000
c,rajat,Chandigarh,70000
d,mohit,Faridabad,80000


In [53]:
# method 2: modify df itself by passing inplace=True
# NOTE: this cell is not re-runnable as-is; a second run raises KeyError
# because 'age' has already been removed from df
df.drop(columns='age', inplace=True)
df

Unnamed: 0,name,city,salary
a,abhishek,Delhi,50000
b,kunal,Palwal,60000
c,rajat,Chandigarh,70000
d,mohit,Faridabad,80000


In [54]:
# give every row a 10% raise: new salary = salary + salary / 10
# NOTE: this overwrites the column, so each re-run of the cell adds another 10%

df['salary'] = df['salary'].add(df['salary'].div(10))

In [55]:
# salary is now a float column because the increment used true division
df

Unnamed: 0,name,city,salary
a,abhishek,Delhi,55000.0
b,kunal,Palwal,66000.0
c,rajat,Chandigarh,77000.0
d,mohit,Faridabad,88000.0


## Important Attributes

In [56]:
# Show the dtype of every column in the sales data.
# sep='' stops print() from inserting its default space after the newline,
# which would otherwise shift the first row of the listing one column right.
print("data type of each column: \n", sales_df.dtypes, sep='')

data type of each column: 
 Product_ID                int64
Sale_Date                object
Sales_Rep                object
Region                   object
Sales_Amount            float64
Quantity_Sold             int64
Product_Category         object
Unit_Cost               float64
Unit_Price              float64
Customer_Type            object
Discount                float64
Payment_Method           object
Sales_Channel            object
Region_and_Sales_Rep     object
dtype: object


In [60]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
# sep='' stops print() from inserting its default space after the newline,
# which would otherwise push the header row out of line with the rows below it.
print('statistical description of data: \n', sales_df.describe(), sep='')

statistical description of data: 
         Product_ID  Sales_Amount  Quantity_Sold    Unit_Cost   Unit_Price  \
count  1000.000000   1000.000000    1000.000000  1000.000000  1000.000000   
mean   1050.128000   5019.265230      25.355000  2475.304550  2728.440120   
std      29.573505   2846.790126      14.159006  1417.872546  1419.399839   
min    1001.000000    100.120000       1.000000    60.280000   167.120000   
25%    1024.000000   2550.297500      13.000000  1238.380000  1509.085000   
50%    1051.000000   5019.300000      25.000000  2467.235000  2696.400000   
75%    1075.000000   7507.445000      38.000000  3702.865000  3957.970000   
max    1100.000000   9989.040000      49.000000  4995.300000  5442.150000   

         Discount  
count  1000.00000  
mean      0.15239  
std       0.08720  
min       0.00000  
25%       0.08000  
50%       0.15000  
75%       0.23000  
max       0.30000  


In [63]:
# same summary statistics, shown as the cell's result (table display) instead of print()
sales_df.describe()

Unnamed: 0,Product_ID,Sales_Amount,Quantity_Sold,Unit_Cost,Unit_Price,Discount
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,1050.128,5019.26523,25.355,2475.30455,2728.44012,0.15239
std,29.573505,2846.790126,14.159006,1417.872546,1419.399839,0.0872
min,1001.0,100.12,1.0,60.28,167.12,0.0
25%,1024.0,2550.2975,13.0,1238.38,1509.085,0.08
50%,1051.0,5019.3,25.0,2467.235,2696.4,0.15
75%,1075.0,7507.445,38.0,3702.865,3957.97,0.23
max,1100.0,9989.04,49.0,4995.3,5442.15,0.3
