<a href="https://colab.research.google.com/github/hikkaaaa/machine-learning/blob/main/pandas_library_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **How to change and reset index**

In [249]:
import pandas as pd
import numpy as np
# Load the semicolon-separated dog dataset into a DataFrame.
# NOTE(review): the relative ./drive/MyDrive path presumably requires Google Drive
# to be mounted in Colab before this cell runs — confirm.
my_df = pd.read_csv('./drive/MyDrive/Colab Notebooks/dog.csv', sep = ";")
my_df

Unnamed: 0,Breed,Color,DogName,OwnerZip
0,COCKAPOO,BROWN,CHARLEY,15236
1,GER SHEPHERD,BLACK/BROWN,TACODA,15237
2,BELG MALINOIS,BRINDLE,EICH,15238
3,MIXED,BLACK/BROWN,ARROW,15239
4,AM PIT BULL TERRIER,WHITE/BROWN,OAKLEY,15240
5,SCOTTISH TERRIER,BLACK,BAILEY,15241
6,YORKSHIRE TERRIER,BLACK/TAN,MIMI PEARL FOSTER,15242
7,DACHSHUND MIX,WHITE/BROWN,LEROI,15243
8,SHETLAND SHEEPDOG,WHITE/BROWN,ZOE VITSAS,15244
9,LABRADOR RETRIEVER,YELLOW,TAFFY,15245


In [250]:
# The index is the unlabeled left-most column of row labels (0, 1, 2, ... by default).
# Keep only the first five rows. .copy() makes the result an independent frame, so the
# column assignment and inplace edits in the following cells do not operate on a slice
# of the originally loaded data (avoids SettingWithCopyWarning).
my_df = my_df.head().copy()
my_df

Unnamed: 0,Breed,Color,DogName,OwnerZip
0,COCKAPOO,BROWN,CHARLEY,15236
1,GER SHEPHERD,BLACK/BROWN,TACODA,15237
2,BELG MALINOIS,BRINDLE,EICH,15238
3,MIXED,BLACK/BROWN,ARROW,15239
4,AM PIT BULL TERRIER,WHITE/BROWN,OAKLEY,15240


In [251]:
# Add a new column holding a label for every row ("Dog 1", "Dog 2", ...).
# The labels are generated from len(my_df), so the list always has exactly one entry
# per row and the assignment cannot fail with a length mismatch if the row count changes.
my_df["FrameHeader"] = [f"Dog {row_number}" for row_number in range(1, len(my_df) + 1)]
my_df

Unnamed: 0,Breed,Color,DogName,OwnerZip,FrameHeader
0,COCKAPOO,BROWN,CHARLEY,15236,Dog 1
1,GER SHEPHERD,BLACK/BROWN,TACODA,15237,Dog 2
2,BELG MALINOIS,BRINDLE,EICH,15238,Dog 3
3,MIXED,BLACK/BROWN,ARROW,15239,Dog 4
4,AM PIT BULL TERRIER,WHITE/BROWN,OAKLEY,15240,Dog 5


In [252]:
# Use the FrameHeader column as the row index
my_df.set_index("FrameHeader", inplace = True) # inplace = True modifies my_df itself instead of returning a new frame

In [253]:
my_df

Unnamed: 0_level_0,Breed,Color,DogName,OwnerZip
FrameHeader,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Dog 1,COCKAPOO,BROWN,CHARLEY,15236
Dog 2,GER SHEPHERD,BLACK/BROWN,TACODA,15237
Dog 3,BELG MALINOIS,BRINDLE,EICH,15238
Dog 4,MIXED,BLACK/BROWN,ARROW,15239
Dog 5,AM PIT BULL TERRIER,WHITE/BROWN,OAKLEY,15240


In [254]:
# Restore the default integer index; the old index comes back as a regular FrameHeader column
my_df.reset_index(inplace = True) # inplace = True modifies my_df itself instead of returning a new frame

In [255]:
my_df

Unnamed: 0,FrameHeader,Breed,Color,DogName,OwnerZip
0,Dog 1,COCKAPOO,BROWN,CHARLEY,15236
1,Dog 2,GER SHEPHERD,BLACK/BROWN,TACODA,15237
2,Dog 3,BELG MALINOIS,BRINDLE,EICH,15238
3,Dog 4,MIXED,BLACK/BROWN,ARROW,15239
4,Dog 5,AM PIT BULL TERRIER,WHITE/BROWN,OAKLEY,15240


In [256]:
# Remove the FrameHeader column from my_df itself
my_df.drop(columns = "FrameHeader", inplace = True)

In [257]:
my_df

Unnamed: 0,Breed,Color,DogName,OwnerZip
0,COCKAPOO,BROWN,CHARLEY,15236
1,GER SHEPHERD,BLACK/BROWN,TACODA,15237
2,BELG MALINOIS,BRINDLE,EICH,15238
3,MIXED,BLACK/BROWN,ARROW,15239
4,AM PIT BULL TERRIER,WHITE/BROWN,OAKLEY,15240


# **How to fix incomplete data**

In [258]:
# Build a small frame with missing values to experiment on
stuff = {'A': [1, 2, 3], 'B': [4, np.nan, np.nan], 'C': [7, 8, 9], 'D': [10, 11, 12]} # np.nan marks a missing value
my_df = pd.DataFrame(stuff)
my_df

Unnamed: 0,A,B,C,D
0,1,4.0,7,10
1,2,,8,11
2,3,,9,12


In [259]:
# Return a new frame without the rows that contain a missing value (my_df is not modified)
my_df.dropna(axis = "index")

Unnamed: 0,A,B,C,D
0,1,4.0,7,10


In [260]:
# Return a new frame without the columns that contain a missing value (my_df is not modified)
my_df.dropna(axis = "columns")

Unnamed: 0,A,C,D
0,1,7,10
1,2,8,11
2,3,9,12


In [261]:
#not permanent, inplace = False
my_df

Unnamed: 0,A,B,C,D
0,1,4.0,7,10
1,2,,8,11
2,3,,9,12


In [262]:
# thresh sets the minimum number of non-NaN values a column needs in order to be kept
my_df.dropna(thresh = 2, axis = 1) # column B has only one non-NaN value (fewer than 2), so it is dropped

Unnamed: 0,A,C,D
0,1,7,10
1,2,8,11
2,3,9,12


In [263]:
# Replace missing values with fillna()
my_df.fillna(value = "Bob") # every NaN becomes the string "Bob"

Unnamed: 0,A,B,C,D
0,1,4.0,7,10
1,2,Bob,8,11
2,3,Bob,9,12


In [264]:
# Fill with a computed statistic
my_df.fillna(value = my_df['B'].mean()) # mean of column B's non-NaN values; the scalar is used for NaNs in every column

Unnamed: 0,A,B,C,D
0,1,4.0,7,10
1,2,4.0,8,11
2,3,4.0,9,12


In [265]:
# Fill with the minimum
my_df.fillna(value = my_df['B'].min()) # smallest non-NaN value in column B

Unnamed: 0,A,B,C,D
0,1,4.0,7,10
1,2,4.0,8,11
2,3,4.0,9,12


In [266]:
# Fill with the maximum (largest non-NaN value in column B)
my_df.fillna(value = my_df['B'].max())

Unnamed: 0,A,B,C,D
0,1,4.0,7,10
1,2,4.0,8,11
2,3,4.0,9,12


In [267]:
# Fill with the sum of column B's non-NaN values
my_df.fillna(value = my_df['B'].sum())

Unnamed: 0,A,B,C,D
0,1,4.0,7,10
1,2,4.0,8,11
2,3,4.0,9,12


# **DataFrame GroupBy**

In [268]:
# Sample data: two employees for each of three corporations
stuff = {
    'Corporation':['Apple', 'Google', 'Meta', 'Apple', 'Google', 'Meta'],
    'Employees':['John', 'April', 'Wes', 'Beth', 'Justin', 'Steph'],
    'Salary':[200, 220, 190, 130, 120, 150]}
# Build the DataFrame from the dictionary (keys become column names)
my_df = pd.DataFrame(stuff)
my_df

Unnamed: 0,Corporation,Employees,Salary
0,Apple,John,200
1,Google,April,220
2,Meta,Wes,190
3,Apple,Beth,130
4,Google,Justin,120
5,Meta,Steph,150


In [269]:
# Group the rows by Corporation. groupby() returns a DataFrameGroupBy object, not a table:
# displaying it only shows the object's repr (type and memory address) until an aggregation is called on it.
company = my_df.groupby('Corporation')
company

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f5bb5f0e610>

In [270]:
# Sum each column per corporation; for the text column Employees, "sum" concatenates the names
company.sum()

Unnamed: 0_level_0,Employees,Salary
Corporation,Unnamed: 1_level_1,Unnamed: 2_level_1
Apple,JohnBeth,330
Google,AprilJustin,340
Meta,WesSteph,340


In [271]:
# Mean per corporation; numeric_only = True restricts the result to numeric columns (here: Salary)
company.mean(numeric_only = True)

Unnamed: 0_level_0,Salary
Corporation,Unnamed: 1_level_1
Apple,165.0
Google,170.0
Meta,170.0


In [272]:
# Largest value of each column per corporation; for the text column Employees this is the alphabetically last name
company.max()

Unnamed: 0_level_0,Employees,Salary
Corporation,Unnamed: 1_level_1,Unnamed: 2_level_1
Apple,John,200
Google,Justin,220
Meta,Wes,190


In [273]:
# Smallest value of each column per corporation; for the text column Employees this is the alphabetically first name
company.min()

Unnamed: 0_level_0,Employees,Salary
Corporation,Unnamed: 1_level_1,Unnamed: 2_level_1
Apple,Beth,130
Google,April,120
Meta,Steph,150


In [274]:
# Sample standard deviation of the numeric columns per corporation
company.std(numeric_only = True)

Unnamed: 0_level_0,Salary
Corporation,Unnamed: 1_level_1
Apple,49.497475
Google,70.710678
Meta,28.284271


In [275]:
# Sample variance of the numeric columns per corporation (the square of the standard deviation)
company.var(numeric_only = True)

Unnamed: 0_level_0,Salary
Corporation,Unnamed: 1_level_1
Apple,2450.0
Google,5000.0
Meta,800.0


In [276]:
# Number of non-null entries in each column per corporation
company.count()

Unnamed: 0_level_0,Employees,Salary
Corporation,Unnamed: 1_level_1,Unnamed: 2_level_1
Apple,2,2
Google,2,2
Meta,2,2


In [277]:
# Summary statistics (count, mean, std, min, quartiles, max) per corporation; the output covers the Salary column
company.describe()

Unnamed: 0_level_0,Salary,Salary,Salary,Salary,Salary,Salary,Salary,Salary
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Corporation,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Apple,2.0,165.0,49.497475,130.0,147.5,165.0,182.5,200.0
Google,2.0,170.0,70.710678,120.0,145.0,170.0,195.0,220.0
Meta,2.0,170.0,28.284271,150.0,160.0,170.0,180.0,190.0


# **How to count unique data in pandas**

In [278]:
# Sample data for this section; the Breed, Color and DogName columns contain repeated values
content = {
    'Breed': ['SMT', 'AMT', 'PZM', 'ROP', 'PZM', 'SMT', 'SMT', 'XOL', 'SMT'],
    'Color': ['GREEN', 'BROWN', 'GRAY', 'RED', 'WHITE', 'GRAY', 'GREEN', 'RED', 'PURPLE'],
    'DogName': ['CHARLEY', 'OMNIOUS', 'SMOP', 'CHARLEY', 'OIKAK', 'CHARLEY', 'SMOP', 'EDIBLE', 'OIKAK'],
    'OwnerZip': [12556,15236,15485,15269,15236,14857,18596,17589,13652]}
df = pd.DataFrame(content)
df

Unnamed: 0,Breed,Color,DogName,OwnerZip
0,SMT,GREEN,CHARLEY,12556
1,AMT,BROWN,OMNIOUS,15236
2,PZM,GRAY,SMOP,15485
3,ROP,RED,CHARLEY,15269
4,PZM,WHITE,OIKAK,15236
5,SMT,GRAY,CHARLEY,14857
6,SMT,GREEN,SMOP,18596
7,XOL,RED,EDIBLE,17589
8,SMT,PURPLE,OIKAK,13652


In [279]:
#grab a specific column
df["DogName"]

Unnamed: 0,DogName
0,CHARLEY
1,OMNIOUS
2,SMOP
3,CHARLEY
4,OIKAK
5,CHARLEY
6,SMOP
7,EDIBLE
8,OIKAK


In [280]:
# Count how often each dog name occurs (most frequent first).
# The result is stored in a variable: a bare expression in the middle of a cell is
# evaluated but never displayed, and this way the counts are computed only once.
name_counts = df["DogName"].value_counts()
# Show the counts as a DataFrame (the last expression of the cell is what gets displayed)
name_counts.to_frame()

Unnamed: 0_level_0,count
DogName,Unnamed: 1_level_1
CHARLEY,3
SMOP,2
OIKAK,2
OMNIOUS,1
EDIBLE,1


In [281]:
# Distinct dog names as a NumPy array, in order of first appearance
df['DogName'].unique()

array(['CHARLEY', 'OMNIOUS', 'SMOP', 'OIKAK', 'EDIBLE'], dtype=object)

In [282]:
# Wrap the array of distinct names in a DataFrame; the single column is labelled 0 because the array carries no name
pd.DataFrame(df['DogName'].unique())

Unnamed: 0,0
0,CHARLEY
1,OMNIOUS
2,SMOP
3,OIKAK
4,EDIBLE


# **How to apply functions to dataframes**

In [283]:
#the dataframe we gonna be working with:
my_df

Unnamed: 0,Corporation,Employees,Salary
0,Apple,John,200
1,Google,April,220
2,Meta,Wes,190
3,Apple,Beth,130
4,Google,Justin,120
5,Meta,Steph,150


In [284]:
# Helper: scale a whole-number value by 1000 and render it with thousands separators
def times1000(x):
  """Return x * 1000 as a comma-grouped integer string, e.g. 200 -> '200,000'."""
  scaled = x * 1000
  return f"{scaled:,d}"

In [285]:
# Call times1000 on every value of the Salary column; the result is a new Series
my_df['Salary'].apply(times1000)

Unnamed: 0,Salary
0,200000
1,220000
2,190000
3,130000
4,120000
5,150000


In [286]:
# Same result, displayed as a one-column DataFrame
my_df['Salary'].apply(times1000).to_frame()

Unnamed: 0,Salary
0,200000
1,220000
2,190000
3,130000
4,120000
5,150000


In [287]:
# Helper: attach the "Elder" last name to John; every other value gets a placeholder
def namer(x):
  """Return 'John Elder' when x equals "John", otherwise "Not Important"."""
  return 'John Elder' if x == "John" else "Not Important"

In [288]:
# Run namer on every value of the Employees column
my_df['Employees'].apply(namer)

Unnamed: 0,Employees
0,John Elder
1,Not Important
2,Not Important
3,Not Important
4,Not Important
5,Not Important


In [289]:
# Same result, displayed as a one-column DataFrame
my_df['Employees'].apply(namer).to_frame()

Unnamed: 0,Employees
0,John Elder
1,Not Important
2,Not Important
3,Not Important
4,Not Important
5,Not Important


In [290]:
# Same idea with an anonymous (lambda) function instead of a named one
my_df['Salary'].apply(lambda x: x * 1000)
# The formatting can be done inside the lambda as well:
#my_df['Salary'].apply(lambda x: format(x * 1000, ',d'))

Unnamed: 0,Salary
0,200000
1,220000
2,190000
3,130000
4,120000
5,150000


# **Applying functions to multiple columns**

In [291]:
#the data
my_df

Unnamed: 0,Corporation,Employees,Salary
0,Apple,John,200
1,Google,April,220
2,Meta,Wes,190
3,Apple,Beth,130
4,Google,Justin,120
5,Meta,Steph,150


In [292]:
# Helper: scale a whole-number value by 1000 and render it with thousands separators
# (same definition as the times1000 helper in the previous section, repeated here)
def times1000(x):
  """Return x * 1000 as a comma-grouped integer string, e.g. 200 -> '200,000'."""
  scaled = x * 1000
  return f"{scaled:,d}"

In [293]:
# Call times1000 on every value of the Salary column; the result is a new Series
my_df['Salary'].apply(times1000)

Unnamed: 0,Salary
0,200000
1,220000
2,190000
3,130000
4,120000
5,150000


In [294]:
# Overwrite the Salary column of my_df with the formatted values.
# NOTE: times1000 returns strings, so Salary no longer holds numbers after this cell,
# and running the cell a second time would try to format those strings and fail.
my_df['Salary'] = my_df['Salary'].apply(times1000)
my_df

Unnamed: 0,Corporation,Employees,Salary
0,Apple,John,200000
1,Google,April,220000
2,Meta,Wes,190000
3,Apple,Beth,130000
4,Google,Justin,120000
5,Meta,Steph,150000


In [295]:
# Helper meant for several columns at once: put "SMT: " in front of a value
def namer(x):
  """Return x with the prefix "SMT: " prepended.

  Works for a single string and for a pandas Series of strings, which is what
  DataFrame.apply hands over (one column at a time). This definition re-uses the
  name of the earlier single-name helper `namer` and replaces it from here on.
  """
  prefix = "SMT: "
  return prefix + x

In [296]:
# Select two columns and run namer on them; the result is a new frame
my_df[['Corporation', 'Employees']].apply(namer)

Unnamed: 0,Corporation,Employees
0,SMT: Apple,SMT: John
1,SMT: Google,SMT: April
2,SMT: Meta,SMT: Wes
3,SMT: Apple,SMT: Beth
4,SMT: Google,SMT: Justin
5,SMT: Meta,SMT: Steph


In [297]:
# Overwrite both columns of my_df with the prefixed values.
# NOTE: re-running this cell adds the "SMT: " prefix a second time.
my_df[['Corporation', 'Employees']] = my_df[['Corporation', 'Employees']].apply(namer)
my_df

Unnamed: 0,Corporation,Employees,Salary
0,SMT: Apple,SMT: John,200000
1,SMT: Google,SMT: April,220000
2,SMT: Meta,SMT: Wes,190000
3,SMT: Apple,SMT: Beth,130000
4,SMT: Google,SMT: Justin,120000
5,SMT: Meta,SMT: Steph,150000


# **Sorting and Ordering data**

In [298]:
#the data
my_df

Unnamed: 0,Corporation,Employees,Salary
0,SMT: Apple,SMT: John,200000
1,SMT: Google,SMT: April,220000
2,SMT: Meta,SMT: Wes,190000
3,SMT: Apple,SMT: Beth,130000
4,SMT: Google,SMT: Justin,120000
5,SMT: Meta,SMT: Steph,150000


In [300]:
# Sort the rows by Salary in ascending order (returns a new, sorted frame)
# NOTE(review): Salary was replaced with formatted strings in an earlier cell, so this presumably compares text rather than numbers — confirm
my_df.sort_values('Salary')

Unnamed: 0,Corporation,Employees,Salary
4,SMT: Google,SMT: Justin,120000
3,SMT: Apple,SMT: Beth,130000
5,SMT: Meta,SMT: Steph,150000
2,SMT: Meta,SMT: Wes,190000
0,SMT: Apple,SMT: John,200000
1,SMT: Google,SMT: April,220000


In [304]:
# Sort the rows by Salary in descending order (returns a new, sorted frame)
my_df.sort_values('Salary', ascending = False)

Unnamed: 0,Corporation,Employees,Salary
1,SMT: Google,SMT: April,220000
0,SMT: Apple,SMT: John,200000
2,SMT: Meta,SMT: Wes,190000
5,SMT: Meta,SMT: Steph,150000
3,SMT: Apple,SMT: Beth,130000
4,SMT: Google,SMT: Justin,120000


In [305]:
#not permanent without inplace = True
my_df

Unnamed: 0,Corporation,Employees,Salary
0,SMT: Apple,SMT: John,200000
1,SMT: Google,SMT: April,220000
2,SMT: Meta,SMT: Wes,190000
3,SMT: Apple,SMT: Beth,130000
4,SMT: Google,SMT: Justin,120000
5,SMT: Meta,SMT: Steph,150000


In [307]:
# inplace = True reorders my_df itself instead of returning a sorted copy
my_df.sort_values('Salary', inplace = True)
my_df

Unnamed: 0,Corporation,Employees,Salary
4,SMT: Google,SMT: Justin,120000
3,SMT: Apple,SMT: Beth,130000
5,SMT: Meta,SMT: Steph,150000
2,SMT: Meta,SMT: Wes,190000
0,SMT: Apple,SMT: John,200000
1,SMT: Google,SMT: April,220000
