In [83]:
import pandas as pd
import numpy as np

###  DataFrame: 
* In Python, the Pandas library provides a data structure called a DataFrame, which is a two-dimensional, tabular data structure with labeled axes (rows and columns).
* It is similar to a spreadsheet or a SQL table, and it is a powerful tool for data manipulation and analysis.

In [84]:
x=pd.DataFrame([pd.date_range('14-10-2023','23-10-2023'),
                         np.random.randint(-100,-1,10),
                         np.random.randint(1,100,10),
                         [chr(ele)for ele in np.random.randint(97,123,10)],
                         [chr(ele)for ele in np.random.randint(65,91,10)]])
x

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,2023-10-14 00:00:00,2023-10-15 00:00:00,2023-10-16 00:00:00,2023-10-17 00:00:00,2023-10-18 00:00:00,2023-10-19 00:00:00,2023-10-20 00:00:00,2023-10-21 00:00:00,2023-10-22 00:00:00,2023-10-23 00:00:00
1,-20,-21,-83,-100,-55,-42,-7,-90,-76,-55
2,45,5,43,88,3,74,63,72,3,46
3,d,a,o,g,r,d,x,c,b,w
4,A,I,N,H,S,A,A,L,N,S


### Transpose:
* the transpose of a DataFrame can be obtained using the .T attribute. 
* The transpose operation switches the rows and columns of the DataFrame.

In [85]:
x.T

Unnamed: 0,0,1,2,3,4
0,2023-10-14,-20,45,d,A
1,2023-10-15,-21,5,a,I
2,2023-10-16,-83,43,o,N
3,2023-10-17,-100,88,g,H
4,2023-10-18,-55,3,r,S
5,2023-10-19,-42,74,d,A
6,2023-10-20,-7,63,x,A
7,2023-10-21,-90,72,c,L
8,2023-10-22,-76,3,b,N
9,2023-10-23,-55,46,w,S


#### copy():
* The copy() method returns a copy of the DataFrame.
* By default, the copy is a "deep copy" meaning that any changes made in the original DataFrame will NOT be reflected in the copy.

In [86]:
X = x.T.copy()
X

Unnamed: 0,0,1,2,3,4
0,2023-10-14,-20,45,d,A
1,2023-10-15,-21,5,a,I
2,2023-10-16,-83,43,o,N
3,2023-10-17,-100,88,g,H
4,2023-10-18,-55,3,r,S
5,2023-10-19,-42,74,d,A
6,2023-10-20,-7,63,x,A
7,2023-10-21,-90,72,c,L
8,2023-10-22,-76,3,b,N
9,2023-10-23,-55,46,w,S


#### Column Labels: 
* Columns in a DataFrame have names or labels. You can access columns using these labels.

In [87]:
X.columns = ['col_'+str(ele) for ele in X.columns]
X

Unnamed: 0,col_0,col_1,col_2,col_3,col_4
0,2023-10-14,-20,45,d,A
1,2023-10-15,-21,5,a,I
2,2023-10-16,-83,43,o,N
3,2023-10-17,-100,88,g,H
4,2023-10-18,-55,3,r,S
5,2023-10-19,-42,74,d,A
6,2023-10-20,-7,63,x,A
7,2023-10-21,-90,72,c,L
8,2023-10-22,-76,3,b,N
9,2023-10-23,-55,46,w,S


#### Row Labels (Index): 
* Rows in a DataFrame are identified by an index. By default, Pandas assigns a numeric index starting from 0. 
* You can also set a specific column as the index or create a custom index.

In [88]:
X.index = ['row_'+str(ele) for ele in X.index]
X

Unnamed: 0,col_0,col_1,col_2,col_3,col_4
row_0,2023-10-14,-20,45,d,A
row_1,2023-10-15,-21,5,a,I
row_2,2023-10-16,-83,43,o,N
row_3,2023-10-17,-100,88,g,H
row_4,2023-10-18,-55,3,r,S
row_5,2023-10-19,-42,74,d,A
row_6,2023-10-20,-7,63,x,A
row_7,2023-10-21,-90,72,c,L
row_8,2023-10-22,-76,3,b,N
row_9,2023-10-23,-55,46,w,S


#### rename:
* In Pandas, you can use the rename method to rename rows or columns in a DataFrame.

In [89]:
X.rename(columns = {'col_0':'date'},
        index = {'row_9':'ninthrow'},
        inplace = True) # The inplace=True argument modifies the DataFrame in place.
X

Unnamed: 0,date,col_1,col_2,col_3,col_4
row_0,2023-10-14,-20,45,d,A
row_1,2023-10-15,-21,5,a,I
row_2,2023-10-16,-83,43,o,N
row_3,2023-10-17,-100,88,g,H
row_4,2023-10-18,-55,3,r,S
row_5,2023-10-19,-42,74,d,A
row_6,2023-10-20,-7,63,x,A
row_7,2023-10-21,-90,72,c,L
row_8,2023-10-22,-76,3,b,N
ninthrow,2023-10-23,-55,46,w,S


#### Custom Index:
* we can set a custom index for a DataFrame using the set_index method

In [90]:
X.set_index('date')

Unnamed: 0_level_0,col_1,col_2,col_3,col_4
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-10-14,-20,45,d,A
2023-10-15,-21,5,a,I
2023-10-16,-83,43,o,N
2023-10-17,-100,88,g,H
2023-10-18,-55,3,r,S
2023-10-19,-42,74,d,A
2023-10-20,-7,63,x,A
2023-10-21,-90,72,c,L
2023-10-22,-76,3,b,N
2023-10-23,-55,46,w,S


#### Resetting the Index:
* You can reset the index to the default integer-based index using the reset_index method

In [91]:
X.reset_index(drop = True, inplace = True)
X

Unnamed: 0,date,col_1,col_2,col_3,col_4
0,2023-10-14,-20,45,d,A
1,2023-10-15,-21,5,a,I
2,2023-10-16,-83,43,o,N
3,2023-10-17,-100,88,g,H
4,2023-10-18,-55,3,r,S
5,2023-10-19,-42,74,d,A
6,2023-10-20,-7,63,x,A
7,2023-10-21,-90,72,c,L
8,2023-10-22,-76,3,b,N
9,2023-10-23,-55,46,w,S


column dropping:
* To drop specific columns, you can use the drop method with the columns parameter

In [92]:
X.set_index('date',drop = False)

Unnamed: 0_level_0,date,col_1,col_2,col_3,col_4
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-10-14,2023-10-14,-20,45,d,A
2023-10-15,2023-10-15,-21,5,a,I
2023-10-16,2023-10-16,-83,43,o,N
2023-10-17,2023-10-17,-100,88,g,H
2023-10-18,2023-10-18,-55,3,r,S
2023-10-19,2023-10-19,-42,74,d,A
2023-10-20,2023-10-20,-7,63,x,A
2023-10-21,2023-10-21,-90,72,c,L
2023-10-22,2023-10-22,-76,3,b,N
2023-10-23,2023-10-23,-55,46,w,S


#### drop:
* The drop method is used to remove the existing index, and reset_index(drop=True) essentially resets the index to the default integer-based index. The drop=True parameter prevents the old index from being added as a new column.
#### inplace:
* Remember that using inplace=True modifies the original DataFrame. If you prefer not to modify the original DataFrame and instead create a new one with the changes, you can omit inplace=True and assign the result to a new variable

In [93]:
X.reset_index(drop= True, inplace = True)
X

Unnamed: 0,date,col_1,col_2,col_3,col_4
0,2023-10-14,-20,45,d,A
1,2023-10-15,-21,5,a,I
2,2023-10-16,-83,43,o,N
3,2023-10-17,-100,88,g,H
4,2023-10-18,-55,3,r,S
5,2023-10-19,-42,74,d,A
6,2023-10-20,-7,63,x,A
7,2023-10-21,-90,72,c,L
8,2023-10-22,-76,3,b,N
9,2023-10-23,-55,46,w,S


#### normal:
* is a function that generates random samples from a normal (Gaussian) distribution. The normal distribution is characterized by its mean and standard deviation.
* loc: The mean (center) of the normal distribution.
* scale: The standard deviation (spread or width) of the distribution.
* size: The number of random samples to generate. If size is None (the default), a single random value is generated. If size is an integer or tuple of integers, it specifies the shape of the output.


In [94]:
X['col_5'] = np.random.normal(0,1,10)
X

Unnamed: 0,date,col_1,col_2,col_3,col_4,col_5
0,2023-10-14,-20,45,d,A,0.526006
1,2023-10-15,-21,5,a,I,0.004013
2,2023-10-16,-83,43,o,N,-1.541072
3,2023-10-17,-100,88,g,H,-0.895096
4,2023-10-18,-55,3,r,S,0.946994
5,2023-10-19,-42,74,d,A,1.242323
6,2023-10-20,-7,63,x,A,1.040327
7,2023-10-21,-90,72,c,L,1.283695
8,2023-10-22,-76,3,b,N,-0.907585
9,2023-10-23,-55,46,w,S,0.546043


#### insert:
* To insert a new column into a Pandas DataFrame, you can use the insert method. The insert method allows you to specify the position of the new column within the DataFrame.

In [95]:
X.insert( loc = 6, column = 'col_6', value = np.NAN)
X

Unnamed: 0,date,col_1,col_2,col_3,col_4,col_5,col_6
0,2023-10-14,-20,45,d,A,0.526006,
1,2023-10-15,-21,5,a,I,0.004013,
2,2023-10-16,-83,43,o,N,-1.541072,
3,2023-10-17,-100,88,g,H,-0.895096,
4,2023-10-18,-55,3,r,S,0.946994,
5,2023-10-19,-42,74,d,A,1.242323,
6,2023-10-20,-7,63,x,A,1.040327,
7,2023-10-21,-90,72,c,L,1.283695,
8,2023-10-22,-76,3,b,N,-0.907585,
9,2023-10-23,-55,46,w,S,0.546043,


In [96]:
X.insert( loc = 0, column = 'subZero', value = np.NAN) # inserting column subzeros
X

Unnamed: 0,subZero,date,col_1,col_2,col_3,col_4,col_5,col_6
0,,2023-10-14,-20,45,d,A,0.526006,
1,,2023-10-15,-21,5,a,I,0.004013,
2,,2023-10-16,-83,43,o,N,-1.541072,
3,,2023-10-17,-100,88,g,H,-0.895096,
4,,2023-10-18,-55,3,r,S,0.946994,
5,,2023-10-19,-42,74,d,A,1.242323,
6,,2023-10-20,-7,63,x,A,1.040327,
7,,2023-10-21,-90,72,c,L,1.283695,
8,,2023-10-22,-76,3,b,N,-0.907585,
9,,2023-10-23,-55,46,w,S,0.546043,


In [97]:
np.random.seed(23)
z = pd.DataFrame({'col1':np.random.randint(-1000,1000,1000),
                 'col2':np.random.randint(-100,100,1000),
                 'col3':np.random.normal(0,1,1000)})
z

Unnamed: 0,col1,col2,col3
0,-405,70,-0.308605
1,-258,65,-0.861110
2,64,4,0.493138
3,993,-52,-0.027219
4,-50,51,-0.730339
...,...,...,...
995,-699,46,-0.654231
996,-133,-65,-1.221007
997,853,-45,0.206911
998,199,38,-1.494436


#### head:
* The head method in Pandas is used to display the first n rows of a DataFrame. By default, head() displays the first 5 rows. 
* You can specify the number of rows you want to display by passing an argument to the head method.

In [98]:
z.head()

Unnamed: 0,col1,col2,col3
0,-405,70,-0.308605
1,-258,65,-0.86111
2,64,4,0.493138
3,993,-52,-0.027219
4,-50,51,-0.730339


#### tail:
* The tail method in Pandas is used to display the last n rows of a DataFrame. By default, tail() displays the last 5 rows.
* You can specify the number of rows you want to display by passing an argument to the tail method.

In [99]:
z.tail()

Unnamed: 0,col1,col2,col3
995,-699,46,-0.654231
996,-133,-65,-1.221007
997,853,-45,0.206911
998,199,38,-1.494436
999,-364,-33,-0.964724


#### sample:
* The sample method in Pandas is used to randomly sample rows from a DataFrame. 
* This can be useful for selecting a random subset of your data for analysis or testing. 
* You can specify the number of rows you want to sample, and you can also set a random seed for reproducibility.

In [100]:
z.sample()

Unnamed: 0,col1,col2,col3
774,922,-14,-0.380026


In [101]:
z.sample(10) # randomly we can access the sample values

Unnamed: 0,col1,col2,col3
208,-851,84,-1.68677
438,700,46,-0.705908
188,922,77,-0.434953
128,-777,-84,-0.300938
827,588,71,0.550091
218,616,61,-0.335848
667,35,12,-0.365017
600,932,-31,-0.375552
332,-536,86,-0.369553
360,-280,14,-0.879301


#### describe:
* The describe method in Pandas is used to generate descriptive statistics of a DataFrame. 
* It provides a summary of central tendency, dispersion, and shape of the distribution of a dataset's values.

In [102]:
help(pd.DataFrame.describe)

Help on function describe in module pandas.core.generic:

describe(self: 'NDFrameT', percentiles=None, include=None, exclude=None) -> 'NDFrameT'
    Generate descriptive statistics.
    
    Descriptive statistics include those that summarize the central
    tendency, dispersion and shape of a
    dataset's distribution, excluding ``NaN`` values.
    
    Analyzes both numeric and object series, as well
    as ``DataFrame`` column sets of mixed data types. The output
    will vary depending on what is provided. Refer to the notes
    below for more detail.
    
    Parameters
    ----------
    percentiles : list-like of numbers, optional
        The percentiles to include in the output. All should
        fall between 0 and 1. The default is
        ``[.25, .5, .75]``, which returns the 25th, 50th, and
        75th percentiles.
    include : 'all', list-like of dtypes or None (default), optional
        A white list of data types to include in the result. Ignored
        for ``Series`

In [103]:
z.describe()

Unnamed: 0,col1,col2,col3
count,1000.0,1000.0,1000.0
mean,-17.5,-0.238,-0.019928
std,587.412197,57.664153,1.015113
min,-996.0,-100.0,-3.129831
25%,-533.0,-51.0,-0.715867
50%,-21.0,2.0,-0.012334
75%,500.25,48.0,0.675573
max,998.0,99.0,3.2774


In [104]:
z.describe().T # transpose

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
col1,1000.0,-17.5,587.412197,-996.0,-533.0,-21.0,500.25,998.0
col2,1000.0,-0.238,57.664153,-100.0,-51.0,2.0,48.0,99.0
col3,1000.0,-0.019928,1.015113,-3.129831,-0.715867,-0.012334,0.675573,3.2774


#### percentile:
* In Pandas, you can calculate percentiles using the percentile method of a DataFrame or a Series. 
* The percentile method allows you to specify the desired percentiles, and it returns the corresponding values.

In [105]:
z.describe(percentiles = [ele/100 for ele in range(1,10)]).T # The percentile method was introduced in Pandas version 1.1.0. If you are using an older version, you can use the quantile method to achieve a similar result

Unnamed: 0,count,mean,std,min,1%,2%,3%,4%,5%,6%,7%,8%,9%,50%,max
col1,1000.0,-17.5,587.412197,-996.0,-988.01,-965.06,-957.0,-944.0,-927.0,-889.06,-867.07,-838.16,-827.09,-21.0,998.0
col2,1000.0,-0.238,57.664153,-100.0,-99.0,-97.0,-94.0,-93.0,-90.0,-87.06,-86.0,-85.08,-83.0,2.0,99.0
col3,1000.0,-0.019928,1.015113,-3.129831,-2.380267,-2.083956,-1.953019,-1.849965,-1.676092,-1.578121,-1.500112,-1.462351,-1.411913,-0.012334,3.2774


In [106]:
x = pd.DataFrame([pd.date_range('14-10-2023','23-10-2023'),
                         np.random.randint(-100,-1,10),
                         np.random.randint(1,100,10),
                         [chr(ele)for ele in np.random.randint(97,123,10)],
                         [chr(ele)for ele in np.random.randint(65,91,10)]])
x.T

Unnamed: 0,0,1,2,3,4
0,2023-10-14,-81,35,s,G
1,2023-10-15,-73,45,t,O
2,2023-10-16,-97,93,f,Q
3,2023-10-17,-74,1,f,D
4,2023-10-18,-2,44,r,A
5,2023-10-19,-53,76,t,N
6,2023-10-20,-24,62,y,F
7,2023-10-21,-85,63,o,G
8,2023-10-22,-65,91,n,L
9,2023-10-23,-82,12,v,D


#### info:
The info method in Pandas is used to print a concise summary of a DataFrame, including information about the data types, non-null values, and memory usage. It provides a quick overview of the structure of the DataFrame.

In [107]:
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       5 non-null      object
 1   1       5 non-null      object
 2   2       5 non-null      object
 3   3       5 non-null      object
 4   4       5 non-null      object
 5   5       5 non-null      object
 6   6       5 non-null      object
 7   7       5 non-null      object
 8   8       5 non-null      object
 9   9       5 non-null      object
dtypes: object(10)
memory usage: 532.0+ bytes


In [109]:
x.describe().T

Unnamed: 0,count,unique,top,freq
0,5,5,2023-10-14,1
1,5,5,2023-10-15,1
2,5,5,2023-10-16,1
3,5,5,2023-10-17,1
4,5,5,2023-10-18,1
5,5,5,2023-10-19,1
6,5,5,2023-10-20,1
7,5,5,2023-10-21,1
8,5,5,2023-10-22,1
9,5,5,2023-10-23,1


In [110]:
x['col_5'] = 10
x

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,col_5
0,2023-10-14 00:00:00,2023-10-15 00:00:00,2023-10-16 00:00:00,2023-10-17 00:00:00,2023-10-18 00:00:00,2023-10-19 00:00:00,2023-10-20 00:00:00,2023-10-21 00:00:00,2023-10-22 00:00:00,2023-10-23 00:00:00,10
1,-81,-73,-97,-74,-2,-53,-24,-85,-65,-82,10
2,35,45,93,1,44,76,62,63,91,12,10
3,s,t,f,f,r,t,y,o,n,v,10
4,G,O,Q,D,A,N,F,G,L,D,10


In [111]:
x.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
col_5,5.0,10.0,0.0,10.0,10.0,10.0,10.0,10.0


#### include:
* describe method in Pandas, you can use include='all'. This includes both numeric and object (non-numeric) columns.

In [112]:
x.describe(include = 'all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
0,5.0,5.0,2023-10-14,1.0,,,,,,,
1,5.0,5.0,2023-10-15,1.0,,,,,,,
2,5.0,5.0,2023-10-16,1.0,,,,,,,
3,5.0,5.0,2023-10-17,1.0,,,,,,,
4,5.0,5.0,2023-10-18,1.0,,,,,,,
5,5.0,5.0,2023-10-19,1.0,,,,,,,
6,5.0,5.0,2023-10-20,1.0,,,,,,,
7,5.0,5.0,2023-10-21,1.0,,,,,,,
8,5.0,5.0,2023-10-22,1.0,,,,,,,
9,5.0,5.0,2023-10-23,1.0,,,,,,,


In [113]:
x.describe(include = ['object','int64']).T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
0,5.0,5.0,2023-10-14,1.0,,,,,,,
1,5.0,5.0,2023-10-15,1.0,,,,,,,
2,5.0,5.0,2023-10-16,1.0,,,,,,,
3,5.0,5.0,2023-10-17,1.0,,,,,,,
4,5.0,5.0,2023-10-18,1.0,,,,,,,
5,5.0,5.0,2023-10-19,1.0,,,,,,,
6,5.0,5.0,2023-10-20,1.0,,,,,,,
7,5.0,5.0,2023-10-21,1.0,,,,,,,
8,5.0,5.0,2023-10-22,1.0,,,,,,,
9,5.0,5.0,2023-10-23,1.0,,,,,,,


In [114]:
x['col_6'] = 10
x.T

Unnamed: 0,0,1,2,3,4
0,2023-10-14 00:00:00,-81,35,s,G
1,2023-10-15 00:00:00,-73,45,t,O
2,2023-10-16 00:00:00,-97,93,f,Q
3,2023-10-17 00:00:00,-74,1,f,D
4,2023-10-18 00:00:00,-2,44,r,A
5,2023-10-19 00:00:00,-53,76,t,N
6,2023-10-20 00:00:00,-24,62,y,F
7,2023-10-21 00:00:00,-85,63,o,G
8,2023-10-22 00:00:00,-65,91,n,L
9,2023-10-23 00:00:00,-82,12,v,D


#### exclude:
* describe method, you can use the exclude parameter to exclude specific data types from the summary statistics. 
* This parameter allows you to focus on specific types of columns and exclude others

In [115]:
x.describe(exclude = 'float64').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
0,5.0,5.0,2023-10-14,1.0,,,,,,,
1,5.0,5.0,2023-10-15,1.0,,,,,,,
2,5.0,5.0,2023-10-16,1.0,,,,,,,
3,5.0,5.0,2023-10-17,1.0,,,,,,,
4,5.0,5.0,2023-10-18,1.0,,,,,,,
5,5.0,5.0,2023-10-19,1.0,,,,,,,
6,5.0,5.0,2023-10-20,1.0,,,,,,,
7,5.0,5.0,2023-10-21,1.0,,,,,,,
8,5.0,5.0,2023-10-22,1.0,,,,,,,
9,5.0,5.0,2023-10-23,1.0,,,,,,,


In [118]:
x = pd.DataFrame([np.random.randint(-100,-1,10),
                  np.random.randint(1,100,10)])
x

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-98,-16,-15,-65,-58,-8,-58,-98,-59,-91
1,64,29,49,10,41,74,19,73,84,3


In [120]:
x.T

Unnamed: 0,0,1
0,-98,64
1,-16,29
2,-15,49
3,-65,10
4,-58,41
5,-8,74
6,-58,19
7,-98,73
8,-59,84
9,-91,3


In [121]:
x.copy

<bound method NDFrame.copy of     0   1   2   3   4   5   6   7   8   9
0 -98 -16 -15 -65 -58  -8 -58 -98 -59 -91
1  64  29  49  10  41  74  19  73  84   3>

In [122]:
x[1]

0   -16
1    29
Name: 1, dtype: int32

#### indexing:
*Indexing in Pandas refers to the process of selecting a subset of data from a DataFrame or a Series. 

In [126]:
x[1],x[2] = x[1].astype('object'),x[2].astype('int64')

In [127]:
x[1],x[2]

(0    -16
 1    -16
 Name: 1, dtype: object,
 0    29
 1    29
 Name: 2, dtype: int64)