In [3]:
import pandas as pd

In [4]:
# Load the Superstore dataset, decoding the CSV as ISO-8859-1.
df = pd.read_csv('/content/Superstore.csv', encoding='ISO-8859-1')

# Print the schema summary, then show the column names as the cell's result.
# `df.info()` prints its report itself and returns None, so it is called as a
# statement of its own: packing it into a tuple with the column list would
# make the cell display a stray `None` next to the names.
df.info()
df.columns.tolist()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Row ID         9994 non-null   int64  
 1   Order ID       9994 non-null   object 
 2   Order Date     9994 non-null   object 
 3   Ship Date      9994 non-null   object 
 4   Ship Mode      9994 non-null   object 
 5   Customer ID    9994 non-null   object 
 6   Customer Name  9994 non-null   object 
 7   Segment        9994 non-null   object 
 8   Country        9994 non-null   object 
 9   City           9994 non-null   object 
 10  State          9994 non-null   object 
 11  Postal Code    9994 non-null   int64  
 12  Region         9994 non-null   object 
 13  Product ID     9994 non-null   object 
 14  Category       9994 non-null   object 
 15  Sub-Category   9994 non-null   object 
 16  Product Name   9994 non-null   object 
 17  Sales          9994 non-null   float64
 18  Quantity

(None,
 ['Row ID',
  'Order ID',
  'Order Date',
  'Ship Date',
  'Ship Mode',
  'Customer ID',
  'Customer Name',
  'Segment',
  'Country',
  'City',
  'State',
  'Postal Code',
  'Region',
  'Product ID',
  'Category',
  'Sub-Category',
  'Product Name',
  'Sales',
  'Quantity',
  'Discount',
  'Profit'])

In [5]:
# 1. Pivot: total Sales for each Region (rows) by Category (columns)
pd.pivot_table(
    df,
    index='Region',
    columns='Category',
    values='Sales',
    aggfunc='sum',
)

Category,Furniture,Office Supplies,Technology
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Central,163797.1638,167026.415,170416.312
East,208291.204,205516.055,264973.981
South,117298.684,125651.313,148771.908
West,252612.7435,220853.249,251991.832


In [6]:
# 2. Pivot table: mean Profit for each Segment (rows) by Category (columns)
pd.pivot_table(
    df,
    index='Segment',
    columns='Category',
    values='Profit',
    aggfunc='mean',
)

Category,Furniture,Office Supplies,Technology
Segment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Consumer,6.281293,18.014174,74.445646
Corporate,11.741201,22.102923,79.723823
Home Office,10.705465,24.034439,89.152458


In [7]:
# 3. Unpivot: turn the wide Sales / Profit columns into long format, one row
#    per (order line, metric). 'Metric' holds the source column name and
#    'Amount' holds its value; only the first five rows are shown.
pd.melt(
    df,
    id_vars=['Order ID', 'Region', 'Category'],
    value_vars=['Sales', 'Profit'],
    var_name='Metric',
    value_name='Amount',
).head()

Unnamed: 0,Order ID,Region,Category,Metric,Amount
0,CA-2016-152156,South,Furniture,Sales,261.96
1,CA-2016-152156,South,Furniture,Sales,731.94
2,CA-2016-138688,West,Office Supplies,Sales,14.62
3,US-2015-108966,South,Furniture,Sales,957.5775
4,US-2015-108966,South,Office Supplies,Sales,22.368


In [8]:
# Classify a profit amount into a High / Medium / Low bucket
def classify_margin(profit, high_threshold=100, medium_threshold=0):
    """Return the bucket label for a single profit value.

    Note that the value is compared as an absolute profit amount, not as a
    margin ratio (profit divided by sales).

    Parameters
    ----------
    profit : number
        The profit value to classify.
    high_threshold : number, default 100
        Values strictly greater than this are labelled 'High'.
    medium_threshold : number, default 0
        Values strictly greater than this (and not 'High') are labelled
        'Medium'.

    Returns
    -------
    str
        'High', 'Medium', or 'Low'. Anything that is not strictly above
        ``medium_threshold`` falls through to 'Low'; this includes NaN,
        because every comparison against NaN is False.
    """
    if profit > high_threshold:
        return 'High'
    if profit > medium_threshold:
        return 'Medium'
    return 'Low'

# Label every row by applying classify_margin to its Profit value
df['Profit_Margin_Category'] = df['Profit'].apply(classify_margin)

# Flag high-discount rows. Comparing the whole column at once yields the
# boolean flags directly, with no Python-level lambda call per row.
# NOTE(review): none of the previewed rows exceed this threshold - confirm
# that 0.9 is the intended cut-off for the Discount column.
HIGH_DISCOUNT_THRESHOLD = 0.9
df['High_Discount_Flag'] = df['Discount'] > HIGH_DISCOUNT_THRESHOLD

# Preview the new columns next to the columns they were derived from
print(df[['Profit', 'Profit_Margin_Category', 'Discount', 'High_Discount_Flag']].head())


     Profit Profit_Margin_Category  Discount  High_Discount_Flag
0   41.9136                 Medium      0.00               False
1  219.5820                   High      0.00               False
2    6.8714                 Medium      0.00               False
3 -383.0310                    Low      0.45               False
4    2.5164                 Medium      0.20               False


In [9]:
# Derive a cleaned Segment column in which 'Consumer' becomes 'Retail';
# every other segment value is carried over unchanged.
df['Segment_Cleaned'] = df['Segment'].replace(to_replace='Consumer', value='Retail')

# Look-up table from Region name to a one-letter code (for demonstration).
# Region values that are not keys of this table map to NaN.
region_map = dict(West='W', East='E', Central='C', South='S')
df['Region_Code'] = df['Region'].map(region_map)

# Preview the original columns next to the derived ones
print(df.loc[:, ['Segment', 'Segment_Cleaned', 'Region', 'Region_Code']].head())


     Segment Segment_Cleaned Region Region_Code
0   Consumer          Retail  South           S
1   Consumer          Retail  South           S
2  Corporate       Corporate   West           W
3   Consumer          Retail  South           S
4   Consumer          Retail  South           S


In [10]:
# Take two consecutive 10-row slices: positions 0-9 and positions 10-19
df1 = df.iloc[0:10]
df2 = df.iloc[10:20]

# Stack the slices on top of each other (row-wise is concat's default axis)
# and renumber the resulting index from 0.
df_vert = pd.concat([df1, df2], ignore_index=True)

print("Vertical Concatenation:")
print(df_vert.head())


Vertical Concatenation:
   Row ID        Order ID  Order Date   Ship Date       Ship Mode Customer ID  \
0       1  CA-2016-152156   11/8/2016  11/11/2016    Second Class    CG-12520   
1       2  CA-2016-152156   11/8/2016  11/11/2016    Second Class    CG-12520   
2       3  CA-2016-138688   6/12/2016   6/16/2016    Second Class    DV-13045   
3       4  US-2015-108966  10/11/2015  10/18/2015  Standard Class    SO-20335   
4       5  US-2015-108966  10/11/2015  10/18/2015  Standard Class    SO-20335   

     Customer Name    Segment        Country             City  ...  \
0      Claire Gute   Consumer  United States        Henderson  ...   
1      Claire Gute   Consumer  United States        Henderson  ...   
2  Darrin Van Huff  Corporate  United States      Los Angeles  ...   
3   Sean O'Donnell   Consumer  United States  Fort Lauderdale  ...   
4   Sean O'Donnell   Consumer  United States  Fort Lauderdale  ...   

  Sub-Category                                       Product Name   

In [11]:
# Take the first 10 rows twice, each time keeping a different pair of columns
df3 = df.iloc[:10][['Order ID', 'Customer Name']]
df4 = df.iloc[:10][['Product Name', 'Sales']]

# Place the two frames side by side; rows are aligned on their shared index
df_horiz = pd.concat([df3, df4], axis='columns')

print("Horizontal Concatenation:")
print(df_horiz.head())


Horizontal Concatenation:
         Order ID    Customer Name  \
0  CA-2016-152156      Claire Gute   
1  CA-2016-152156      Claire Gute   
2  CA-2016-138688  Darrin Van Huff   
3  US-2015-108966   Sean O'Donnell   
4  US-2015-108966   Sean O'Donnell   

                                        Product Name     Sales  
0                  Bush Somerset Collection Bookcase  261.9600  
1  Hon Deluxe Fabric Upholstered Stacking Chairs,...  731.9400  
2  Self-Adhesive Address Labels for Typewriters b...   14.6200  
3      Bretford CR4500 Series Slim Rectangular Table  957.5775  
4                     Eldon Fold 'N Roll Cart System   22.3680  


In [12]:
# Build a pipeline: re-read the raw CSV. This rebinds `df` to a fresh frame
# that does not carry the columns added by the earlier cells.
import pandas as pd

df = pd.read_csv(
    "/content/Superstore.csv",
    encoding="ISO-8859-1",
)


In [13]:
# Discard any row that has a missing value in one of the critical columns
df_cleaned = df.dropna(
    how='any',
    subset=['Sales', 'Profit', 'Category', 'Region'],
)

# Keep only the rows whose Profit is strictly positive
df_filtered = df_cleaned.loc[df_cleaned['Profit'].gt(0)]


In [14]:
# Total Profit for each (Category, Region) pair, returned as a flat frame
# with Category, Region and Profit as ordinary columns.
grouped = (
    df_filtered
    .groupby(by=['Category', 'Region'])['Profit']
    .sum()
    .reset_index()
)


In [15]:
# Helper that labels a profit total with a High / Medium / Low tier
def label_tier(value, high_threshold=10000, medium_threshold=1000):
    """Return the tier label for a single numeric value.

    Parameters
    ----------
    value : number
        The amount to classify.
    high_threshold : number, default 10000
        Values strictly greater than this are labelled 'High'.
    medium_threshold : number, default 1000
        Values strictly greater than this (and not 'High') are labelled
        'Medium'.

    Returns
    -------
    str
        'High', 'Medium', or 'Low'. Anything that is not strictly above
        ``medium_threshold`` falls through to 'Low'; this includes NaN,
        because every comparison against NaN is False.
    """
    if value > high_threshold:
        return 'High'
    if value > medium_threshold:
        return 'Medium'
    return 'Low'

# Add a Profit_Tier column by labelling each grouped Profit total
grouped['Profit_Tier'] = grouped['Profit'].map(label_tier)


In [17]:
# Reshape the grouped totals into a Region (rows) by Category (columns) grid
# of Profit values.
pd.pivot(
    grouped,
    index='Region',
    columns='Category',
    values='Profit',
)


Category,Furniture,Office Supplies,Technology
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Central,16683.3159,42364.1454,36973.7862
East,21847.9512,50805.769,68459.6673
South,16693.2307,29699.904,27861.1279
West,24162.884,56236.2409,50740.2849
