### Understanding DataFrames
A DataFrame is a two-dimensional, labeled data structure whose columns can each hold a different data type. It is the fundamental data structure in pandas and is most often used to represent tabular data.

In [23]:
import pandas as pd

# Column-oriented input: every key is a column label and every list
# holds that column's values, one entry per row.
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Emily'],
    'Age': [25, 30, 28, 22, 27],
    'City': ['New York', 'Los Angeles', 'Chicago', 'Miami', 'Dallas'],
    'Salary': [50000, 60000, 55000, 45000, 65000],
}

# Build the frame from the mapping; rows get the default 0..4 integer index.
df = pd.DataFrame.from_dict(data, orient='columns')

print(df)

      Name  Age         City  Salary
0    Alice   25     New York   50000
1      Bob   30  Los Angeles   60000
2  Charlie   28      Chicago   55000
3    David   22        Miami   45000
4    Emily   27       Dallas   65000


In [24]:
# Row-oriented input: each inner list is one record, in column order.
data = [
    ['Alice', 25, 'New York', 50000],
    ['Bob', 30, 'Los Angeles', 60000],
    ['Charlie', 28, 'Chicago', 55000],
    ['David', 22, 'Miami', 45000],
    ['Emily', 27, 'Dallas', 65000],
]

# The rows carry no labels of their own, so the column names are supplied explicitly.
df = pd.DataFrame.from_records(
    data,
    columns=['Name', 'Age', 'City', 'Salary'],
)

print(df)

      Name  Age         City  Salary
0    Alice   25     New York   50000
1      Bob   30  Los Angeles   60000
2  Charlie   28      Chicago   55000
3    David   22        Miami   45000
4    Emily   27       Dallas   65000


In [25]:
# Select the 'Name' column (all rows, a single column label); the result is a Series.
print(df.loc[:, 'Name'])

0      Alice
1        Bob
2    Charlie
3      David
4      Emily
Name: Name, dtype: object


In [26]:
# Positional lookup: row at position 0 with every column, returned as a Series.
print(df.iloc[0, :])

Name         Alice
Age             25
City      New York
Salary       50000
Name: 0, dtype: object


In [27]:
# Label-based scalar lookup: row label 0 (the first row here), column 'Name'.
print(df.at[0, 'Name'])

Alice


In [28]:
# Bare last expression: the notebook shows the 'Age' column as the cell's result.
df.loc[:, 'Age']

0    25
1    30
2    28
3    22
4    27
Name: Age, dtype: int64

In [29]:
# Boolean-mask selection: keep only the rows whose Age exceeds 25.
filtered_df = df[df['Age'].gt(25)]
print(filtered_df)

      Name  Age         City  Salary
1      Bob   30  Los Angeles   60000
2  Charlie   28      Chicago   55000
4    Emily   27       Dallas   65000


In [30]:
# Order the rows from youngest to oldest; each row keeps its original index label.
sorted_df = df.sort_values(by='Age', ascending=True)
print(sorted_df)

      Name  Age         City  Salary
3    David   22        Miami   45000
0    Alice   25     New York   50000
4    Emily   27       Dallas   65000
2  Charlie   28      Chicago   55000
1      Bob   30  Los Angeles   60000


In [31]:
# Average salary per city; the result is a Series indexed by city name.
grouped_df = (
    df.groupby(by='City')['Salary']
    .mean()
)
print(grouped_df)

City
Chicago        55000.0
Dallas         65000.0
Los Angeles    60000.0
Miami          45000.0
New York       50000.0
Name: Salary, dtype: float64


In [32]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
print(df.describe(percentiles=[0.25, 0.5, 0.75]))

            Age       Salary
count   5.00000      5.00000
mean   26.40000  55000.00000
std     3.04959   7905.69415
min    22.00000  45000.00000
25%    25.00000  50000.00000
50%    27.00000  55000.00000
75%    28.00000  60000.00000
max    30.00000  65000.00000


In [43]:
import pandas as pd

# Two small frames that share the same row labels (K0, K1, K2).
df1 = pd.DataFrame(
    {'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']},
    index=['K0', 'K1', 'K2'],
)
df2 = pd.DataFrame(
    {'C': ['C0', 'C1', 'C2'], 'D': ['D0', 'D1', 'D2']},
    index=['K0', 'K1', 'K2'],
)

# Inner join on the index: only row labels present in both frames are kept.
merged_df = df1.merge(df2, how='inner', left_index=True, right_index=True)
print(merged_df)


# Tips for merging:
# - Clean your data: ensure keys and values are consistent and accurate before merging.
# - Use informative column names: clear names help avoid confusion in the merged result.
# - Explore different merge types: try how='inner', 'left', 'right', and 'outer' to achieve the desired outcome.
# - Visualize the data: use profiling tools such as ydata-profiling (formerly Pandas Profiling) to understand the data better.

     A   B   C   D
K0  A0  B0  C0  D0
K1  A1  B1  C1  D1
K2  A2  B2  C2  D2


In [44]:
# DataFrame.join aligns the two frames on their index; a left join is its default.
joined_df = df1.join(df2, how='left')
print(joined_df)

     A   B   C   D
K0  A0  B0  C0  D0
K1  A1  B1  C1  D1
K2  A2  B2  C2  D2


In [45]:
# Create a DatetimeIndex: ten consecutive daily timestamps starting 2023-01-01.
date_index = pd.date_range('2023-01-01', periods=10)

# Create a DataFrame of daily sales indexed by date.
# Note: this rebinds `df`, replacing whatever frame was stored under that name.
df = pd.DataFrame({'Sales': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]}, index=date_index)

# Resampling data: total sales per calendar month, labelled by the month's last day.
# An explicit MonthEnd offset is passed instead of the string alias 'M': that alias
# is deprecated in recent pandas releases, while its replacement 'ME' is not
# recognised by older ones. The offset object behaves the same on both.
resampled_df = df.resample(pd.offsets.MonthEnd()).sum()
print(resampled_df)

# Time-based calculations: 3-day rolling mean of sales.
# The first two rows are NaN because their windows hold fewer than 3 observations.
df['Rolling_Mean'] = df['Sales'].rolling(window=3).mean()
print(df)

            Sales
2023-01-31    550
            Sales  Rolling_Mean
2023-01-01     10           NaN
2023-01-02     20           NaN
2023-01-03     30          20.0
2023-01-04     40          30.0
2023-01-05     50          40.0
2023-01-06     60          50.0
2023-01-07     70          60.0
2023-01-08     80          70.0
2023-01-09     90          80.0
2023-01-10    100          90.0
