# List of codes used in Project - Diwali Sales Analysis

### Data Cleaning

In [None]:
# Data cleaning: load the Diwali sales CSV, drop unused columns and null rows, and fix the Amount dtype.
# NOTE(review): the plotting cells below call seaborn as `sns`, but no import cell is visible
# in this notebook - confirm seaborn is imported before running them.
import pandas as pd

df = pd.read_csv('Sarwar Ayaan Ali - Diwali Sales Data.csv', encoding= 'unicode_escape')        # Import the csv file

df.shape                                                       # (number of rows, number of columns)

df.head(10)                                                    # First 10 rows; df.head() without an argument returns the first 5

df.info()                                                      # Per-column non-null count and data type

df = df.drop(columns=['Status', 'unnamed1'])                   # Drop two whole COLUMNS (flagged by the author as unrelated/null); reassign instead of inplace

pd.isnull(df)                                                  # Boolean frame: True wherever a value is null

pd.isnull(df).sum()                                            # Count of nulls in each column

df = df.dropna()                                               # Drop every row that contains at least one null value

pd.isnull(df).sum()                                            # Re-check: every column should now report 0 nulls

df = df.astype({'Amount': 'int'})                              # Cast Amount to int (done after dropna, since nulls cannot be cast to int)

df['Amount'].dtype                                             # Confirm the new data type of Amount

df.columns                                                     # List all column names

# Rename template ({old_name: new_name}). As written both names are identical, so this is a
# no-op and the column names stay unchanged; replace the value to actually rename the column.
df = df.rename(columns={'Marital_Status': 'Marital_Status'})

df.columns                                                     # Column names after the rename step

df.describe()                                                  # Summary statistics (count, mean, std, min, 25%, 50%, 75%, max) of the numeric columns

df[['Age', 'Orders', 'Amount']].describe()                     # Same summary restricted to specific columns

# Exploratory Data Analysis

### Gender

In [None]:
# Gender - chart 1: number of rows (buyers) per gender
# NOTE(review): neither chart in this cell is given an explicit `ax`; confirm they are not
# meant to be drawn on separate figures when the whole cell is run at once.

axis = sns.countplot(data = df, x = 'Gender')          # One bar per gender, bar height = row count

for container in axis.containers:                      # Write the value on top of every bar
    axis.bar_label(container)

# -------------------------------------------------------------------------------------------------------------
# Gender - chart 2: total purchase amount per gender

gender_amount = (
    df.groupby(['Gender'], as_index=False)['Amount']
      .sum()                                                               # Total Amount per gender
      .sort_values(by='Amount', ascending=False)                           # Largest total first
      .assign(Amount=lambda totals: totals['Amount'] / 1_000_000)          # Express the totals in millions
)

axis = sns.barplot(data = gender_amount, x = 'Gender', y = 'Amount', hue = 'Amount')       # Bars coloured by their own Amount

for container in axis.containers:                                          # Label bars with one decimal and an "M" suffix
    axis.bar_label(container, fmt='%1.1fM')

# -------------------------------------------------------------------------------------------------------------
# *Finding from the charts above: most of the buyers are female, and females also account for a higher total purchase amount than males*

### Age

In [None]:
# Age - chart 1: number of rows (buyers) per age group, split by gender
# NOTE(review): neither chart in this cell is given an explicit `ax`; confirm they are not
# meant to be drawn on separate figures when the whole cell is run at once.

age_gender = sns.countplot(
    data = df.sort_values(by='Age Group', ascending=True),      # Sort rows so the age groups appear in ascending order
    x = 'Age Group',
    hue = 'Gender',                                             # One bar per gender inside each age group
)

for container in age_gender.containers:                         # Write the count on top of every bar
    age_gender.bar_label(container)

# -------------------------------------------------------------------------------------------------------------
# Age - chart 2: total purchase amount per age group

age_amount = (
    df.groupby(['Age Group'], as_index=False)['Amount']
      .sum()                                                               # Total Amount per age group
      .sort_values(by='Age Group', ascending=True)                         # Keep the age groups in ascending order
      .assign(Amount=lambda totals: totals['Amount'] / 1_000_000)          # Express the totals in millions
)

axis = sns.barplot(data = age_amount, x = 'Age Group', y = 'Amount', hue = 'Amount')          # Bars coloured by their own Amount

for container in axis.containers:                                          # Label bars with one decimal and an "M" suffix
    axis.bar_label(container, fmt='%1.1fM')

# -------------------------------------------------------------------------------------------------------------
# *Finding from the charts above: most of the buyers are females in the 26-35 yrs age group*

### State

In [None]:
# State: total number of orders for the 10 states with the most orders

state_orders = (
    df.groupby(['State'], as_index=False)['Orders']
      .sum()                                              # Total Orders per state
      .sort_values(by='Orders', ascending=False)          # Most orders first
      .head(10)                                           # Keep only the top 10 states
)

axis = sns.barplot(data = state_orders, x = 'State', y = 'Orders', hue = 'Orders')           # Bars coloured by their own Orders total

for container in axis.containers:                         # Write the order total on top of every bar
    axis.bar_label(container)

# -------------------------------------------------------------------------------------------------------------
# *Finding from the chart above: most of the orders & total sales/amount are from Uttar Pradesh, Maharashtra and Karnataka respectively*
# NOTE(review): only Orders is plotted in this cell; the total sales/amount part of the finding is not backed by a chart here - confirm.

### Marital Status

In [None]:
# Marital status - chart 1: number of rows (buyers) per Marital_Status value
# (per the author's note: 0 = Married ; 1 = Unmarried)
# NOTE(review): neither chart in this cell is given an explicit `ax`; confirm they are not
# meant to be drawn on separate figures when the whole cell is run at once.

axis = sns.countplot(x = 'Marital_Status', data = df)                           # One bar per marital status, bar height = row count

for container in axis.containers:                                               # Write the count on top of every bar
    axis.bar_label(container)

# -------------------------------------------------------------------------------------------------------------
# Marital status - chart 2: total purchase amount per marital status, split by gender

marital_amount_per_gender = (
    df.groupby(['Marital_Status', 'Gender'], as_index=False)['Amount']
      .sum()                                                               # Total Amount per (marital status, gender) pair
      .sort_values(by='Amount', ascending=False)                           # Largest total first
      .assign(Amount=lambda totals: totals['Amount'] / 1_000_000)          # Express the totals in millions
)

axis = sns.barplot(
    data = marital_amount_per_gender,
    x = 'Marital_Status',
    y = 'Amount',
    hue = 'Gender',                                                        # One bar per gender inside each marital status
)

for container in axis.containers:                                          # Label bars with one decimal and an "M" suffix
    axis.bar_label(container, fmt='%1.1fM')

# -------------------------------------------------------------------------------------------------------------
# *Finding from the charts above: most of the buyers are married (women) and they have high purchasing power*

### Occupation

In [None]:
# Occupation - chart 1: number of rows (buyers) per occupation
# NOTE(review): neither chart in this cell is given an explicit `ax`; confirm they are not
# meant to be drawn on separate figures when the whole cell is run at once.

axis = sns.countplot(data = df, x = 'Occupation')                                   # One bar per occupation, bar height = row count

for bars in axis.containers:                                                        # Write the count on top of every bar
    axis.bar_label(bars)

# -------------------------------------------------------------------------------------------------------------
# Occupation - chart 2: total purchase amount per occupation

occupation_amount = df.groupby(['Occupation'], as_index=False)['Amount'].sum().sort_values(by='Amount', ascending=False)        # Total Amount per occupation, largest first

occupation_amount['Amount'] = occupation_amount['Amount'] / 1000000                         # Convert values to millions

# Plot the per-occupation totals computed above. The original passed `sales_state`, a name
# that is not defined anywhere in the visible notebook, so the cell could not run on a fresh kernel.
axis = sns.barplot(data = occupation_amount, x = 'Occupation', y = 'Amount', hue = 'Amount')         # Bars coloured by their own Amount

for bars in axis.containers:                                                                # Label bars with one decimal and an "M" suffix
    axis.bar_label(bars, fmt='%1.1fM')

# -------------------------------------------------------------------------------------------------------------
# *Finding from the charts above: most of the buyers are working in the IT, Healthcare and Aviation sectors*

### Product Category

In [None]:
# Product category - chart 1: number of rows (purchases) per product category
# NOTE(review): neither chart in this cell is given an explicit `ax`; confirm they are not
# meant to be drawn on separate figures when the whole cell is run at once.

axis = sns.countplot(x = 'Product_Category', data = df)                                # One bar per category, bar height = row count

for container in axis.containers:                                                      # Write the count on top of every bar
    axis.bar_label(container)

# -------------------------------------------------------------------------------------------------------------
# Product category - chart 2: total purchase amount for the 10 categories with the highest totals

category_amount = (
    df.groupby(['Product_Category'], as_index=False)['Amount']
      .sum()                                                               # Total Amount per category
      .sort_values(by='Amount', ascending=False)                           # Largest total first
      .head(10)                                                            # Keep only the top 10 categories
      .assign(Amount=lambda totals: totals['Amount'] / 1_000_000)          # Express the totals in millions
)

axis = sns.barplot(data = category_amount, x = 'Product_Category', y = 'Amount', hue = 'Amount')          # Bars coloured by their own Amount

for container in axis.containers:                                          # Label bars with one decimal and an "M" suffix
    axis.bar_label(container, fmt='%1.1fM')

# -------------------------------------------------------------------------------------------------------------
# *Finding from the charts above: most of the sold products are from the Food, Clothing and Electronics categories*

### Product ID

In [None]:
# Product ID: total number of orders for the 10 most-ordered products

id_orders = (
    df.groupby(['Product_ID'], as_index=False)['Orders']
      .sum()                                              # Total Orders per product
      .sort_values(by='Orders', ascending=False)          # Most orders first
      .head(10)                                           # Keep only the top 10 products
)

axis = sns.barplot(data = id_orders, x = 'Product_ID', y = 'Orders', hue = 'Orders')            # Bars coloured by their own Orders total

for container in axis.containers:                         # Write the order total on top of every bar
    axis.bar_label(container)

# -------------------------------------------------------------------------------------------------------------
# *Finding from the chart above: most of the orders are for the Product ID P00265242*

### Conclusion

*Married women in the 26-35 yrs age group from UP, Maharashtra and Karnataka, working in IT, Healthcare and Aviation, are more likely to buy products from the Food, Clothing and Electronics categories*

### Project Learnings

*Performed Data Cleaning and Manipulation*<br>
*Performed Exploratory Data Analysis (EDA) using the pandas, matplotlib and seaborn libraries*<br>
*Improved customer experience by identifying potential customers across different states, occupations, genders and age groups*<br>
*Improved sales by identifying the best-selling product categories and products, which can help to plan inventory and hence meet demand*