# File Handling
Read and write files using Python.

In [None]:
# Write a short message to example.txt, then read it back and print it.
# The encoding is passed explicitly so the round-trip does not depend on the
# platform's default text encoding.
with open("example.txt", "w", encoding="utf-8") as file:  # Writing
    file.write("Hello, World!")

with open("example.txt", "r", encoding="utf-8") as file:  # Reading
    content = file.read()

# The file is already closed here; only the in-memory string is needed.
print("File content:", content)

# Create a DataFrame

In [None]:
import pandas as pd

# Column-oriented input: each key becomes a column, each list its values.
data = dict(
    Name=['Alice', 'Bob', 'Charlie'],
    Age=[25, 30, 35],
    City=['New York', 'Los Angeles', 'Chicago'],
)

# Build the frame from the mapping and show it.
df = pd.DataFrame.from_dict(data)
print("DataFrame:\n", df)

# To access a specific column

In [None]:
# Pull the 'Name' column out as a Series, then print it under a heading.
name_column = df['Name']
print("\nNames:\n", name_column)

# To Filter rows

In [None]:
# Boolean mask: True for rows whose Age is strictly greater than 30.
older_than_30 = df['Age'] > 30
print("\nPeople older than 30:\n", df[older_than_30])

# Selecting a specific column

In [None]:
# Indexing with a single column label returns that column as a Series.
# As the last expression in the cell, the result is displayed automatically.
df['Age']

# Selecting multiple columns

In [None]:
# Indexing with a list of labels returns a DataFrame holding just those columns.
# The frame built earlier in this notebook has the columns Name, Age and City;
# 'Salary' only exists in the frame created later for the CSV example, so asking
# for it here would raise a KeyError on a fresh top-to-bottom run.
df[['Name', 'Age']]

# Selecting a specific row using loc

In [None]:
# .loc selects by index label. The frame was built without an explicit index,
# so label 1 refers to the second row.
df.loc[1]

# Selecting a row using iloc

In [None]:
# .iloc selects by integer position (0-based), so 2 is the third row
# regardless of what the index labels are.
df.iloc[2]

# Data frame to CSV

In [None]:
import pandas as pd
# Sample records, one list per column.
data_dict = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    Age=[24, 27, 22, 32, 29],
    Salary=[50000, 55000, 60000, 65000, 70000],
)

# Build the frame and save it next to the notebook; index=False keeps the
# row index out of the file.
df = pd.DataFrame.from_dict(data_dict)
df.to_csv(path_or_buf='data.csv', index=False)
print("See the CSV file in the folder")

# Reading data from the CSV file

In [None]:
# Load the CSV written in the previous step (any other CSV path works as well).
df_csv = pd.read_csv(filepath_or_buffer='data.csv')
print("\nData read from CSV file:", df_csv, sep="\n")

# DataFrame to Excel file

In [None]:
# Export the frame to a workbook, into a sheet named 'Sheet1', without the
# row index.
df.to_excel(
    'data.xlsx',
    sheet_name='Sheet1',
    index=False,
)
print("\nExcel file written successfully.")

# Reading data from the Excel file

In [None]:
# Read 'Sheet1' back from the workbook written above and show it.
df_excel = pd.read_excel(
    'data.xlsx',
    sheet_name='Sheet1',
)
print("\nData read from Excel file:", df_excel, sep="\n")

# Mean of Age and Salary

In [None]:
# Column-wise mean of the two numeric columns; shown as the cell's output.
numeric_cols = ['Age', 'Salary']
df[numeric_cols].mean()

# Median of Age and Salary

In [None]:

# Column-wise median of Age and Salary, printed under a heading.
median_values = df[['Age', 'Salary']].median()
print("\nMedian of Age and Salary:", median_values, sep="\n")

# Mode of Age and Salary

In [None]:

# mode() returns a DataFrame because a column can have more than one mode.
mode_values = df[['Age', 'Salary']].mode()
print("\nMode of Age and Salary:", mode_values, sep="\n")

# Summarize the data

In [None]:
# describe() produces summary statistics for the frame's columns.
summary_stats = df.describe()
print("\nSummary Statistics:\n", summary_stats)

Seaborn is a visualization library for statistical graphics plotting in Python.

Key Features:

Simplifies complex visualizations like heatmaps, violin plots, and pair plots.

Supports statistical functions (e.g., regression plots).

Works efficiently with Pandas for easy data handling.

Provides built-in datasets for practice (e.g., tips, titanic, iris).

In [None]:
pip install seaborn

# Creating a bar plot for a Sample dataset using Seaborn

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd


# Small sample frame: one bar per category.
data = pd.DataFrame(
    {
        'Category': list('ABCDE'),
        'Values': [10, 20, 15, 25, 30],
    }
)

# barplot returns the Axes it drew on; the title is set on that Axes directly.
ax = sns.barplot(data=data, x='Category', y='Values')
ax.set_title("Seaborn Bar Plot")
plt.show()


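# Built-in datasets

Seaborn provides several example datasets that you can use for practice and testing. They are downloaded from seaborn's online data repository on first use (so an internet connection is needed) and can be loaded using `sns.load_dataset()`. The available names are listed by `sns.get_dataset_names()`.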

In [None]:
import seaborn as sns

# Retrieve the list of example dataset names and print it.
dataset_names = sns.get_dataset_names()
print(dataset_names)

# Loading dataset from seaborn

Syntax: sns.load_dataset("name of the dataset")

In [None]:
# Load the "tips" example dataset and preview its first five rows.
df = sns.load_dataset(name="tips")
print(df.head(n=5))

In [None]:
# Tip against total bill, with points coloured by the 'sex' column.
# scatterplot returns the Axes it drew on, so the title is set on it directly.
ax = sns.scatterplot(data=df, x='total_bill', y='tip', hue='sex')
ax.set_title('Scatter Plot with Seaborn')
plt.show()

# Dataset information

In [None]:
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called on its own; wrapping it in print() would add a stray "None" line
# after the report.
df.info()

# Titanic Dataset

In [1]:
import seaborn as sns
import pandas as pd

# Load seaborn's "titanic" example dataset.
df = sns.load_dataset('titanic')

print("First 5 rows of the Titanic dataset:")
print(df.head())

print("\nDataset Information:")
# info() prints the column summary itself and returns None; calling it inside
# print() would append a spurious "None" line to the output.
df.info()

First 5 rows of the Titanic dataset:
   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------ 

# Missing Values

In [2]:
# Count the null entries in every column and print the counts under a heading.
missing_per_column = df.isna().sum()
print("\nMissing Values in Each Column:", missing_per_column, sep="\n")


Missing Values in Each Column:
survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64


# Filling the missing values with the median

In [3]:
# Replace missing ages with the median age (median() skips missing values).
median_age = df['age'].median()
df['age'] = df['age'].fillna(median_age)

# Re-count nulls per column to confirm 'age' no longer has any.
df.isna().sum()

survived         0
pclass           0
sex              0
age              0
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

# To check whether a column exists

In [4]:
# Membership test against the column index; one message is printed either way.
print(
    "Column 'cabin' exist."
    if 'cabin' in df.columns
    else "Column 'cabin' does not exist."
)

Column 'cabin' does not exist.


# To drop a column (if it's not required)

In [5]:
# drop() returns a new frame without 'embarked'; df itself is left unchanged.
df_new = df.drop(columns='embarked')
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          891 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   class        891 non-null    category
 8   who          891 non-null    object  
 9   adult_male   891 non-null    bool    
 10  deck         203 non-null    category
 11  embark_town  889 non-null    object  
 12  alive        891 non-null    object  
 13  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(4)
memory usage: 73.7+ KB


# To Rename a column  

In [7]:
# Rename 'fare' to 'Ticket Price'. The result is assigned back to df rather
# than mutating the frame with inplace=True, so the change is an explicit
# rebinding that is easy to see when reading the notebook top to bottom.
df = df.rename(columns={'fare': 'Ticket Price'})
df.info() # or df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   survived      891 non-null    int64   
 1   pclass        891 non-null    int64   
 2   sex           891 non-null    object  
 3   age           891 non-null    float64 
 4   sibsp         891 non-null    int64   
 5   parch         891 non-null    int64   
 6   Ticket Price  891 non-null    float64 
 7   embarked      889 non-null    object  
 8   class         891 non-null    category
 9   who           891 non-null    object  
 10  adult_male    891 non-null    bool    
 11  deck          203 non-null    category
 12  embark_town   889 non-null    object  
 13  alive         891 non-null    object  
 14  alone         891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


# Sort by age in ascending order

In [9]:
# sort_values sorts ascending by default and returns a new frame; the original
# row labels are kept, which is why the printed index is not 0..4.
sorted_by_age = df.sort_values('age')
print("\nDataset sorted by age:", sorted_by_age.head(), sep="\n")



Dataset sorted by age:
     survived  pclass     sex   age  sibsp  parch  Ticket Price embarked  \
803         1       3    male  0.42      0      1        8.5167        C   
755         1       2    male  0.67      1      1       14.5000        S   
644         1       3  female  0.75      2      1       19.2583        C   
469         1       3  female  0.75      2      1       19.2583        C   
831         1       2    male  0.83      1      1       18.7500        S   

      class    who  adult_male deck  embark_town alive  alone  
803   Third  child       False  NaN    Cherbourg   yes  False  
755  Second  child       False  NaN  Southampton   yes  False  
644   Third  child       False  NaN    Cherbourg   yes  False  
469   Third  child       False  NaN    Cherbourg   yes  False  
831  Second  child       False  NaN  Southampton   yes  False  


# Filter passengers who paid more than 50 for their ticket

In [10]:
# Boolean mask: True where the ticket price is strictly above 50.
paid_over_50 = df['Ticket Price'].gt(50)
high_fare_passengers = df[paid_over_50]
print("\nPassengers who paid more than 50:", high_fare_passengers.head(), sep="\n")


Passengers who paid more than 50:
    survived  pclass     sex   age  sibsp  parch  Ticket Price embarked  \
1          1       1  female  38.0      1      0       71.2833        C   
3          1       1  female  35.0      1      0       53.1000        S   
6          0       1    male  54.0      0      0       51.8625        S   
27         0       1    male  19.0      3      2      263.0000        S   
31         1       1  female  28.0      1      0      146.5208        C   

    class    who  adult_male deck  embark_town alive  alone  
1   First  woman       False    C    Cherbourg   yes  False  
3   First  woman       False    C  Southampton   yes  False  
6   First    man        True    E  Southampton    no   True  
27  First    man        True    C  Southampton    no  False  
31  First  woman       False    B    Cherbourg   yes  False  


# Add a column

In [12]:
# Row-wise sum of 'sibsp' and 'parch', plus 1.
# NOTE(review): the +1 presumably counts the passenger themself — confirm.
df['family_size'] = df[['sibsp', 'parch']].sum(axis=1) + 1

family_cols = ['sibsp', 'parch', 'family_size']
print("\nDataset with family size:", df[family_cols].head(), sep="\n")


Dataset with family size:
   sibsp  parch  family_size
0      1      0            2
1      1      0            2
2      0      0            1
3      1      0            2
4      0      0            1


# Modify an existing column
### Categorize age

In [13]:
# Label each passenger 'Child' when age is strictly below 18, else 'Adult'.
# The comparison runs on the whole column, then True/False is mapped to labels.
is_minor = df['age'].lt(18)
df['age_group'] = is_minor.map({True: 'Child', False: 'Adult'})
print("\nDataset with age groups:", df[['age', 'age_group']].head(), sep="\n")


Dataset with age groups:
    age age_group
0  22.0     Adult
1  38.0     Adult
2  26.0     Adult
3  35.0     Adult
4  35.0     Adult


In [14]:
# Show the frame's current structure (columns, non-null counts, dtypes) after
# the rename and the two columns added above. info() prints its report directly.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 17 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   survived      891 non-null    int64   
 1   pclass        891 non-null    int64   
 2   sex           891 non-null    object  
 3   age           891 non-null    float64 
 4   sibsp         891 non-null    int64   
 5   parch         891 non-null    int64   
 6   Ticket Price  891 non-null    float64 
 7   embarked      889 non-null    object  
 8   class         891 non-null    category
 9   who           891 non-null    object  
 10  adult_male    891 non-null    bool    
 11  deck          203 non-null    category
 12  embark_town   889 non-null    object  
 13  alive         891 non-null    object  
 14  alone         891 non-null    bool    
 15  family_size   891 non-null    int64   
 16  age_group     891 non-null    object  
dtypes: bool(2), category(2), float64(2), int64(5), object(

# Grouping and calculating the average 

## Group by class and calculate average ticket price

In [16]:
# Group the ticket prices by passenger class, then average within each group.
price_by_class = df.groupby('pclass')['Ticket Price']
avg_ticket_price = price_by_class.mean()
print("\nAverage ticket price by class:", avg_ticket_price, sep="\n")


Average ticket price by class:
pclass
1    84.154687
2    20.662183
3    13.675550
Name: Ticket Price, dtype: float64


## Group by gender and calculate survival rate

In [17]:
# 'survived' holds 0/1 values, so its mean within each 'sex' group is the
# fraction of that group that survived.
survived_by_sex = df.groupby('sex')['survived']
survival_rate = survived_by_sex.mean()
print("\nSurvival rate by gender:", survival_rate, sep="\n")


Survival rate by gender:
sex
female    0.742038
male      0.188908
Name: survived, dtype: float64
