<a href="https://colab.research.google.com/github/jatinmeenaa/pandas_learning/blob/main/Pandas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
# Build a small 3x3 demo frame with explicit row labels (index) and
# column labels (columns) instead of the default 0..n-1 integers.
df = pd.DataFrame(
    [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
    index=["a", "b", "c"],
    columns=["A", "B", "C"],
)
df

Unnamed: 0,A,B,C
a,1,2,3
b,4,5,6
c,7,8,9


### Dataframe
A DataFrame is a 2D labeled data structure in Pandas, like a table with rows and columns.
- pd.DataFrame() : creates a DataFrame from data given as iterables such as lists, dictionaries, arrays, etc.
- by default the rows and columns have integer labels starting from 0.
- index=[] : customize the row labels.
- columns=[] : customize the column labels.

In [None]:
# Select a single column by passing its label to [].
df["A"]

In [None]:
# Select several columns at once by passing a list of labels to [].
df[["B", "C"]]

In [23]:
# A slice inside [] selects rows: [start:stop:step], with stop excluded.
# Only the last expression of the cell is displayed; the first two are
# evaluated and their results discarded.
df[:1]        # rows up to (not including) position 1
df[:1]["A"]   # slice the rows first, then pick column 'A'
df["A"][:1]   # pick column 'A' first, then slice it

Unnamed: 0,A
a,1


In [None]:
# Slices can be chained: the second slice is applied to the result of the first.
# NOTE: if df is still the 3-row frame built in the first cell, df[2:] keeps a
# single row, so the following [3:] leaves an empty DataFrame.
df[2:][3:]

In [None]:
# df.head(n=5) returns the first n rows;
# a negative n returns every row except the last |n|.
df.head(n=1)

In [None]:
# df.tail(n=5) returns the last n rows;
# a negative n returns every row except the first |n|.
df.tail(n=2)

In [None]:
# df.sample(n=1) returns n randomly chosen rows.
# random_state fixes the seed so the same row is drawn on every run,
# which keeps the notebook output reproducible.
df.sample(n=1, random_state=2)

In [None]:
# .columns holds the column labels.
df.columns
# Assigning a new list relabels the columns of df itself, so after this
# line the columns are 'a', 'b', 'c' and lookups such as df['A'] no longer work.
df.columns = ["a", "b", "c"]
# The labels as a plain Python list (this is what the cell displays).
list(df.columns)

In [None]:
# .index holds the row labels.
df.index

In [None]:
# Show a summary of the DataFrame: column names, non-null counts,
# data types and memory usage.
df.info()

In [None]:
# Statistical summary (mean, std, min, max, etc.) of the numeric columns.
df.describe()

In [None]:
# Shape of the DataFrame as (number of rows, number of columns).
df.shape

In [None]:
# Total number of elements in the DataFrame (rows x columns).
df.size

### Loading data into dataframe
- pd.read_csv('file.csv') – for CSV files
- pd.read_excel('file.xlsx') – for Excel files
- pd.read_json('file.json') – for JSON files
- pd.read_sql(query, connection) – for SQL databases

> A URL pointing to the raw file on GitHub also works in place of a local file path.



In [None]:
# Load the student social-media dataset straight from the raw GitHub URL.
data = pd.read_csv(
    'https://raw.githubusercontent.com/jatinmeenaa/pandas_learning/refs/heads/main/Students%20Social%20Media%20Addiction.csv'
)
# Preview five random rows; the fixed seed makes the preview repeatable.
data.sample(n=5, random_state=2)

### Accessing the data

#### (.loc , .iloc)
1. df.loc[] is label-based: access rows/columns using labels \\
(e.g., df.loc[2], df.loc[:, 'Name'])

2. df.iloc[] is position-based: access rows/columns using index positions \\
(e.g., df.iloc[2], df.iloc[:, 0])


> The row and column specification can be a list of rows and list of columns



In [None]:
# --- label-based access: .loc[rows, columns] ---
data.loc[:, "Student_ID"]                          # every row of one column
data.loc[[1, 2, 3, 6], ["Student_ID", "Gender"]]   # listed row labels x listed column labels

# --- position-based access: .iloc[rows, columns] ---
# data.iloc[:, 'Student_ID'] would raise a ValueError: .iloc needs integer
# positions, not labels.
data.iloc[:, 0]      # first column, by position
data.iloc[:4, :4]    # both axes accept slices (this last expression is displayed)

#### setting the values using .loc .iloc


In [None]:
# Assigning through .loc writes into the DataFrame itself.
data.loc[0, "Age"] = 20      # a single cell
data.loc[0:2, "Age"] = 20    # several rows at once (.loc slices include the end label)
data.loc[:, "Age"]           # show the updated column

#### (.at , .iat)
- df.at[row_label, column_label] – fast access to a single value using labels

- df.iat[row_pos, column_pos] – fast access to a single value using integer positions

In [None]:
# .at reads one scalar value addressed by (row label, column label).
data.at[0, "Student_ID"]

#### .sort_values()
  Returns the sorted data; it does not change the original DataFrame. \\


> To sort the actual DataFrame itself, set \\
 inplace = True \\
 In this case the method does not return anything.



In [None]:
# Preview the first rows (head() defaults to n=5).
data.head()

In [None]:
# Both calls below return a sorted copy (ascending by default); the results
# are not stored, so `data` is unchanged by them.
data.sort_values("Age")
data.sort_values(by="Age")  # same thing with the `by` keyword spelled out

# Sort by several keys, one direction per key: Gender descending (0),
# then Age ascending (1). inplace=True reorders `data` itself.
data.sort_values(by=["Gender", "Age"], ascending=[0, 1], inplace=True)
data

In [3]:
# Read the CSV again from the raw GitHub URL and rebind `data` to the
# freshly loaded frame.
data = pd.read_csv(
    'https://raw.githubusercontent.com/jatinmeenaa/pandas_learning/refs/heads/main/Students%20Social%20Media%20Addiction.csv'
)
data

Unnamed: 0,Student_ID,Age,Gender,Academic_Level,Country,Avg_Daily_Usage_Hours,Most_Used_Platform,Affects_Academic_Performance,Sleep_Hours_Per_Night,Mental_Health_Score,Relationship_Status,Conflicts_Over_Social_Media,Addicted_Score
0,1,19,Female,Undergraduate,Bangladesh,5.2,Instagram,Yes,6.5,6,In Relationship,3,8
1,2,22,Male,Graduate,India,2.1,Twitter,No,7.5,8,Single,0,3
2,3,20,Female,Undergraduate,USA,6.0,TikTok,Yes,5.0,5,Complicated,4,9
3,4,18,Male,High School,UK,3.0,YouTube,No,7.0,7,Single,1,4
4,5,21,Male,Graduate,Canada,4.5,Facebook,Yes,6.0,6,In Relationship,2,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...
700,701,20,Female,Undergraduate,Italy,4.7,TikTok,No,7.2,7,In Relationship,2,5
701,702,23,Male,Graduate,Russia,6.8,Instagram,Yes,5.9,4,Single,5,9
702,703,21,Female,Undergraduate,China,5.6,WeChat,Yes,6.7,6,In Relationship,3,7
703,704,24,Male,Graduate,Japan,4.3,Twitter,No,7.5,8,Single,2,4


#### iterating row wise (not preferable, as it is not optimized)

df.iterrows() lets you loop through the DataFrame row by row as (index, Series) pairs.

In [None]:
# iterrows() yields (index label, row) pairs, where each row is a Series.
# Only the first few rows are walked: printing once per row of the full
# dataset floods the output without showing anything new.
for index, row in data.head(3).iterrows():
    print(index)
    print(type(row))
    print('\n\n')

### Filtering the data

#### comparison based filtering (boolean masks)

In [None]:
# A boolean mask inside .loc keeps the rows where the condition is True;
# all columns are returned.
data.loc[data["Gender"] == "Male"]

In [None]:
# Plain [] with a boolean mask gives the same rows as the .loc form.
data[data["Gender"] == "Male"]

In [None]:
# .loc[row_mask, column_labels]: filter the rows and keep only the listed columns.
data.loc[data["Gender"] == "Male", ["Student_ID", "Age"]]

In [None]:
# Same result in two steps: filter the rows first, then select the columns.
data[data["Gender"] == "Male"][["Student_ID", "Age"]]

In [None]:
# Combine conditions with & (element-wise AND). Each condition needs its own
# parentheses because & binds more tightly than == and >.
data[(data["Gender"] == "Male") & (data["Age"] > 19)][["Student_ID", "Age"]]

#### string and regex based filtering
- .str is used to apply string functions on each element of a Series containing strings

- Series.str.contains(pat, case=True, flags=0, na=None, regex=True) \\
na=False : treats a missing value as False (avoiding errors) \\
case=False : matching ignores upper/lower case

In [35]:
# Regex filter: keep the rows whose Country starts with 'I' (^ anchors the
# pattern at the start of the string).
data[data["Country"].str.contains(r"^I")]

Unnamed: 0,Student_ID,Age,Gender,Academic_Level,Country,Avg_Daily_Usage_Hours,Most_Used_Platform,Affects_Academic_Performance,Sleep_Hours_Per_Night,Mental_Health_Score,Relationship_Status,Conflicts_Over_Social_Media,Addicted_Score
1,2,22,Male,Graduate,India,2.1,Twitter,No,7.5,8,Single,0,3
12,13,22,Male,Graduate,Italy,2.8,LinkedIn,No,7.2,8,Single,1,4
25,26,19,Female,High School,Ireland,6.1,Instagram,Yes,5.2,5,Complicated,4,9
32,33,18,Male,High School,Indonesia,5.4,TikTok,Yes,5.4,5,Complicated,4,8
36,37,22,Male,Graduate,Israel,3.1,Facebook,No,6.8,7,Single,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
675,676,22,Male,Graduate,India,7.2,Facebook,Yes,5.7,4,Single,5,9
683,684,23,Male,Graduate,Italy,4.8,Facebook,No,7.1,7,In Relationship,2,5
691,692,24,Male,Graduate,Ireland,5.9,Instagram,Yes,6.5,6,Single,3,7
692,693,19,Female,Undergraduate,India,7.0,TikTok,Yes,5.8,4,Single,5,9


In [36]:
# isin() builds a mask that is True where Country equals any value in the list.
data[data["Country"].isin(["UK", "USA"])]

Unnamed: 0,Student_ID,Age,Gender,Academic_Level,Country,Avg_Daily_Usage_Hours,Most_Used_Platform,Affects_Academic_Performance,Sleep_Hours_Per_Night,Mental_Health_Score,Relationship_Status,Conflicts_Over_Social_Media,Addicted_Score
2,3,20,Female,Undergraduate,USA,6.0,TikTok,Yes,5.0,5,Complicated,4,9
3,4,18,Male,High School,UK,3.0,YouTube,No,7.0,7,Single,1,4
220,221,19,Female,Undergraduate,USA,6.5,Instagram,Yes,6.0,5,Single,4,9
221,222,21,Male,Graduate,UK,5.8,TikTok,Yes,6.5,6,In Relationship,3,7
228,229,19,Female,Undergraduate,USA,7.0,TikTok,Yes,5.8,4,In Relationship,4,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...
665,666,23,Male,Graduate,UK,6.3,Twitter,Yes,6.2,5,Single,4,8
678,679,21,Female,Undergraduate,USA,5.3,Twitter,Yes,6.8,6,In Relationship,3,7
682,683,20,Female,Undergraduate,UK,6.1,Twitter,Yes,6.4,5,Single,4,8
695,696,23,Male,Graduate,USA,5.5,Twitter,Yes,6.7,6,In Relationship,3,7


In [38]:
# query() takes the filter as a string expression that refers to columns by name.
data.query(
    'Country=="UK" and Age<=20'
)

Unnamed: 0,Student_ID,Age,Gender,Academic_Level,Country,Avg_Daily_Usage_Hours,Most_Used_Platform,Affects_Academic_Performance,Sleep_Hours_Per_Night,Mental_Health_Score,Relationship_Status,Conflicts_Over_Social_Media,Addicted_Score
3,4,18,Male,High School,UK,3.0,YouTube,No,7.0,7,Single,1,4
597,598,20,Male,Undergraduate,UK,6.4,Facebook,Yes,6.2,5,Single,4,8
682,683,20,Female,Undergraduate,UK,6.1,Twitter,Yes,6.4,5,Single,4,8


### Adding/Removing columns

In [43]:
import numpy as np

# Assigning a scalar creates the column with that value in every row.
data["Sal"] = 0

# np.where(condition, value_if_true, value_if_false) fills the new column
# row by row: 'Fail' where Mental_Health_Score < 6, otherwise 'Pass'.
data["Mental_state"] = np.where(data["Mental_Health_Score"] < 6, "Fail", "Pass")
data

Unnamed: 0,Student_ID,Age,Gender,Academic_Level,Country,Avg_Daily_Usage_Hours,Most_Used_Platform,Affects_Academic_Performance,Sleep_Hours_Per_Night,Mental_Health_Score,Relationship_Status,Conflicts_Over_Social_Media,Addicted_Score,Sal,Mental_state
0,1,19,Female,Undergraduate,Bangladesh,5.2,Instagram,Yes,6.5,6,In Relationship,3,8,0,Pass
1,2,22,Male,Graduate,India,2.1,Twitter,No,7.5,8,Single,0,3,0,Pass
2,3,20,Female,Undergraduate,USA,6.0,TikTok,Yes,5.0,5,Complicated,4,9,0,Fail
3,4,18,Male,High School,UK,3.0,YouTube,No,7.0,7,Single,1,4,0,Pass
4,5,21,Male,Graduate,Canada,4.5,Facebook,Yes,6.0,6,In Relationship,2,7,0,Pass
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
700,701,20,Female,Undergraduate,Italy,4.7,TikTok,No,7.2,7,In Relationship,2,5,0,Pass
701,702,23,Male,Graduate,Russia,6.8,Instagram,Yes,5.9,4,Single,5,9,0,Fail
702,703,21,Female,Undergraduate,China,5.6,WeChat,Yes,6.7,6,In Relationship,3,7,0,Pass
703,704,24,Male,Graduate,Japan,4.3,Twitter,No,7.5,8,Single,2,4,0,Pass


In [None]:
# Removing columns.
# drop() returns a new DataFrame by default (inplace=False) and leaves the
# original untouched, so the result is bound back to `data`.
# errors='ignore' skips labels that are not present, so the cell can be
# re-run without raising KeyError once the columns are already gone.
data = data.drop(columns=['Sal', 'Mental_state'], errors='ignore')

# In-place alternative (modifies `data` directly and returns None):
#   data.drop(columns=['Sal', 'Mental_state'], inplace=True)
# Use one form or the other: dropping the same columns a second time without
# errors='ignore' raises KeyError.
data

In [None]:
# Plain assignment only adds a second name for the same DataFrame object.
new_data = data
# .copy() creates a separate DataFrame holding a copy of the data;
# new_data is rebound to that copy.
new_data = data.copy()
new_data

In [None]:
# Create a column from existing data: the arithmetic is applied element-wise
# to the whole 'Age' column.
data["new_column"] = data["Age"] + 1
data

In [50]:
# Rename columns with an {old_label: new_label} mapping;
# inplace=True applies the change to `data` itself.
data.rename(
    columns={"new_column": "new"},
    inplace=True,
)
data

Unnamed: 0,Student_ID,Age,Gender,Academic_Level,Country,Avg_Daily_Usage_Hours,Most_Used_Platform,Affects_Academic_Performance,Sleep_Hours_Per_Night,Mental_Health_Score,Relationship_Status,Conflicts_Over_Social_Media,Addicted_Score,new
0,1,19,Female,Undergraduate,Bangladesh,5.2,Instagram,Yes,6.5,6,In Relationship,3,8,20
1,2,22,Male,Graduate,India,2.1,Twitter,No,7.5,8,Single,0,3,23
2,3,20,Female,Undergraduate,USA,6.0,TikTok,Yes,5.0,5,Complicated,4,9,21
3,4,18,Male,High School,UK,3.0,YouTube,No,7.0,7,Single,1,4,19
4,5,21,Male,Graduate,Canada,4.5,Facebook,Yes,6.0,6,In Relationship,2,7,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
700,701,20,Female,Undergraduate,Italy,4.7,TikTok,No,7.2,7,In Relationship,2,5,21
701,702,23,Male,Graduate,Russia,6.8,Instagram,Yes,5.9,4,Single,5,9,24
702,703,21,Female,Undergraduate,China,5.6,WeChat,Yes,6.7,6,In Relationship,3,7,22
703,704,24,Male,Graduate,Japan,4.3,Twitter,No,7.5,8,Single,2,4,25
