# Python Basic Dataframe

[Lecture](https://www.youtube.com/watch?v=rku5rZxS0AA&list=PLG19vXLQHvSAufDFgZEFAYQEwMJXklnQV&index=1)

In [2]:
import pandas as pd               # dataframe 
import numpy as np                # array and matrix math
import os                         # operating system
import matplotlib.pyplot as plt   # plotting

**Select working directory**

In [10]:
# Change the working directory to the relative folder 'GeoDataSets/'
# (resolved against the current working directory).
# NOTE(review): the dataset in this notebook is read from a URL, so this only
# matters for local file I/O - confirm it is still needed.
os.chdir('GeoDataSets/')

In [11]:
# Load the multivariate well dataset straight from GitHub into a DataFrame
# (requires network access).
df = pd.read_csv('https://raw.githubusercontent.com/GeostatsGuy/GeoDataSets/master/2D_MV_200wells.csv')

In [12]:
# Preview the first rows (head() shows 5 by default) to check the load.
df.head()

Unnamed: 0,X,Y,facies_threshold_0.3,porosity,permeability,acoustic_impedance
0,565,1485,1,0.1184,6.17,2.009
1,2585,1185,1,0.1566,6.275,2.864
2,2065,2865,2,0.192,92.297,3.524
3,3575,2655,1,0.1621,9.048,2.157
4,1835,35,1,0.1766,7.123,3.979


## Statistics

In [16]:
# Summary statistics per column: count, mean, std, min, 25%/50%/75% and max.
df.describe()

Unnamed: 0,X,Y,facies_threshold_0.3,porosity,permeability,acoustic_impedance
count,200.0,200.0,200.0,200.0,200.0,200.0
mean,2053.4,1876.15,1.33,0.1493,25.287462,3.000435
std,1113.524641,1137.58016,0.471393,0.032948,64.470135,0.592201
min,25.0,35.0,1.0,0.05,0.01582,2.009
25%,1112.5,920.0,1.0,0.132175,1.36675,2.48325
50%,2160.0,1855.0,1.0,0.15015,4.8255,2.9645
75%,2915.0,2782.5,2.0,0.1742,14.597,3.527
max,3955.0,3995.0,2.0,0.2232,463.641,3.984


In [15]:
# percentiles = [10th percentile (P10), 90th percentile (P90)] replace the default
# 25% / 75% rows; the 50% (median) row still appears in the output.
df.describe(percentiles=[0.1,0.9])

Unnamed: 0,X,Y,facies_threshold_0.3,porosity,permeability,acoustic_impedance
count,200.0,200.0,200.0,200.0,200.0,200.0
mean,2053.4,1876.15,1.33,0.1493,25.287462,3.000435
std,1113.524641,1137.58016,0.471393,0.032948,64.470135,0.592201
min,25.0,35.0,1.0,0.05,0.01582,2.009
10%,414.0,364.0,1.0,0.1061,0.26229,2.1915
50%,2160.0,1855.0,1.0,0.15015,4.8255,2.9645
90%,3510.0,3475.0,2.0,0.19014,56.5344,3.8336
max,3955.0,3995.0,2.0,0.2232,463.641,3.984


In [17]:
# describe() returns a DataFrame, so the summary can be handled like any other DataFrame.
type(df.describe(percentiles=[0.1,0.9]))

pandas.core.frame.DataFrame

In [23]:
# Transpose the summary so each feature is a row and each statistic is a column.
df.describe(percentiles=[0.1,0.9]).T

Unnamed: 0,count,mean,std,min,10%,50%,90%,max
X,200.0,2053.4,1113.524641,25.0,414.0,2160.0,3510.0,3955.0
Y,200.0,1876.15,1137.58016,35.0,364.0,1855.0,3475.0,3995.0
facies_threshold_0.3,200.0,1.33,0.471393,1.0,1.0,1.0,2.0,2.0
porosity,200.0,0.1493,0.032948,0.05,0.1061,0.15015,0.19014,0.2232
permeability,200.0,25.287462,64.470135,0.01582,0.26229,4.8255,56.5344,463.641
acoustic_impedance,200.0,3.000435,0.592201,2.009,2.1915,2.9645,3.8336,3.984


## Rename Columns

In [24]:
# Rename columns to shorter names; rename() returns a new DataFrame, so the
# result is assigned back to df.
df = df.rename(columns={'facies_threshold_0.3': 'facies','permeability':'perm','acoustic_impedance':'ai'})
df.head()

Unnamed: 0,X,Y,facies,porosity,perm,ai
0,565,1485,1,0.1184,6.17,2.009
1,2585,1185,1,0.1566,6.275,2.864
2,2065,2865,2,0.192,92.297,3.524
3,3575,2655,1,0.1621,9.048,2.157
4,1835,35,1,0.1766,7.123,3.979


## Copying 
1. Shallow copy - points to the same memory; change one and both are changed
2. Deep copy - new copy in memory; change only one

In [26]:
# 2. deep copy: the data is duplicated, so editing deep_copy (row 4, 'ai' -> 4.0)
#    leaves df untouched - df.head() still shows the original value.
deep_copy = df.copy(deep=True)
deep_copy.loc[4, 'ai'] = 4.0
df.head()

Unnamed: 0,X,Y,facies,porosity,perm,ai
0,565,1485,1,0.1184,6.17,2.009
1,2585,1185,1,0.1566,6.275,2.864
2,2065,2865,2,0.192,92.297,3.524
3,3575,2655,1,0.1621,9.048,2.157
4,1835,35,1,0.1766,7.123,3.979


In [27]:
# 1. shallow copy: shares the underlying data with df, so the edit to
#    shallow_copy (row 3, 'ai' -> 4.0) also shows up in df.head().
# NOTE: this write-through depends on pandas' Copy-on-Write being disabled; with
#       Copy-on-Write enabled (the default from pandas 3.0) df is left unchanged.
shallow_copy = df.copy(deep=False)
shallow_copy.loc[3, 'ai'] = 4.0
df.head()

Unnamed: 0,X,Y,facies,porosity,perm,ai
0,565,1485,1,0.1184,6.17,2.009
1,2585,1185,1,0.1566,6.275,2.864
2,2065,2865,2,0.192,92.297,3.524
3,3575,2655,1,0.1621,9.048,4.0
4,1835,35,1,0.1766,7.123,3.979


## Add New Feature

In [30]:
# Build a float array of zeros with one entry per row of df, then attach it
# as a new column named 'zeros'.
zeros = np.zeros(len(df))
df['zeros'] = zeros
df.head()

Unnamed: 0,X,Y,facies,porosity,perm,ai,zeros
0,565,1485,1,0.1184,6.17,2.009,0.0
1,2585,1185,1,0.1566,6.275,2.864,0.0
2,2065,2865,2,0.192,92.297,3.524,0.0
3,3575,2655,1,0.1621,9.048,4.0,0.0
4,1835,35,1,0.1766,7.123,3.979,0.0


In [32]:
# Discard the temporary 'zeros' column by name and preview the result.
df = df.drop(columns='zeros')
df.head()

Unnamed: 0,X,Y,facies,porosity,perm,ai
0,565,1485,1,0.1184,6.17,2.009
1,2585,1185,1,0.1566,6.275,2.864
2,2065,2865,2,0.192,92.297,3.524
3,3575,2655,1,0.1621,9.048,4.0
4,1835,35,1,0.1766,7.123,3.979


## Remove a sample

In [34]:
# Remove the sample whose index label is 3 (a label, not a position).
df = df.drop(index=3)
df.head()

Unnamed: 0,X,Y,facies,porosity,perm,ai
0,565,1485,1,0.1184,6.17,2.009
1,2585,1185,1,0.1566,6.275,2.864
2,2065,2865,2,0.192,92.297,3.524
4,1835,35,1,0.1766,7.123,3.979
5,3375,2525,1,0.1239,1.468,2.337


## Feature Engineering

In [35]:
# Derived features are computed element-wise over whole columns (no explicit loop).
df['porosity100'] = df['porosity']*100          # add a new column with porosity in percentage
df['permpor'] = df['perm']/df['porosity']       # add a new feature with ratio of perm / por 
df.head()

Unnamed: 0,X,Y,facies,porosity,perm,ai,porosity100,permpor
0,565,1485,1,0.1184,6.17,2.009,11.84,52.111486
1,2585,1185,1,0.1566,6.275,2.864,15.66,40.070243
2,2065,2865,2,0.192,92.297,3.524,19.2,480.713542
4,1835,35,1,0.1766,7.123,3.979,17.66,40.334088
5,3375,2525,1,0.1239,1.468,2.337,12.39,11.848265


## Conditional Manipulation

In [36]:
# New label feature: 'high' where porosity >= 0.12, otherwise 'low'.
df['tporosity'] = np.where(df['porosity']>=0.12, 'high', 'low')
df.head()

Unnamed: 0,X,Y,facies,porosity,perm,ai,porosity100,permpor,tporosity
0,565,1485,1,0.1184,6.17,2.009,11.84,52.111486,low
1,2585,1185,1,0.1566,6.275,2.864,15.66,40.070243,high
2,2065,2865,2,0.192,92.297,3.524,19.2,480.713542,high
4,1835,35,1,0.1766,7.123,3.979,17.66,40.334088,high
5,3375,2525,1,0.1239,1.468,2.337,12.39,11.848265,high


### Conditional Manipulation with More than One Feature

In [37]:
# Conditional truncation using two features: keep perm where porosity >= 0.12,
# otherwise replace it with the constant 0.0001.
df['perm_cutoff'] = np.where(df['porosity']>=0.12, df['perm'],0.0001)
df.head()

Unnamed: 0,X,Y,facies,porosity,perm,ai,porosity100,permpor,tporosity,perm_cutoff
0,565,1485,1,0.1184,6.17,2.009,11.84,52.111486,low,0.0001
1,2585,1185,1,0.1566,6.275,2.864,15.66,40.070243,high,6.275
2,2065,2865,2,0.192,92.297,3.524,19.2,480.713542,high,92.297
4,1835,35,1,0.1766,7.123,3.979,17.66,40.334088,high,7.123
5,3375,2525,1,0.1239,1.468,2.337,12.39,11.848265,high,1.468
