In [49]:
import pandas as pd

In [50]:
# Load the raw sales export (path is relative to the notebook's working directory)
df = pd.read_csv('ecommerce/sales_data.csv')
# Show the column labels exactly as they appear in the file
df.columns

Index(['Order ID', 'Product', 'Quantity Ordered', 'Price Each', 'Order Date',
       'Customer Shipping Address', 'City Store', 'Category',
       'Customer Gender', 'Customer Age Range', 'Discount'],
      dtype='object')

In [51]:
# Normalise the column labels to snake_case: lower-case everything and
# turn each space into an underscore (literal match, no regex).
df.columns = df.columns.str.lower().str.replace(' ', '_', regex=False)
df.columns

Index(['order_id', 'product', 'quantity_ordered', 'price_each', 'order_date',
       'customer_shipping_address', 'city_store', 'category',
       'customer_gender', 'customer_age_range', 'discount'],
      dtype='object')

In [52]:
# Preview the first two rows after the column rename
df.head(2)

Unnamed: 0,order_id,product,quantity_ordered,price_each,order_date,customer_shipping_address,city_store,category,customer_gender,customer_age_range,discount
0,236670,Wired Headphones,16,11.99,2019-08-31 22:21:00,"359 Spruce St, Seattle, WA 98101",Dallas,Headphones,Male,18-20,18
1,236671,Bose SoundSport Headphones,9,99.99,2019-08-15 15:11:00,"492 Ridge St, Dallas, TX 75001",Los Angeles,Headphones,Male,21-25,21


In [53]:
# Count missing values per column (isna is the same check as isnull)
df.isna().sum()

order_id                     0
product                      0
quantity_ordered             0
price_each                   0
order_date                   0
customer_shipping_address    0
city_store                   0
category                     0
customer_gender              0
customer_age_range           0
discount                     0
dtype: int64

In [54]:
# Inspect the dtype and non-null count of the order_id column
df['order_id'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 185950 entries, 0 to 185949
Series name: order_id
Non-Null Count   Dtype
--------------   -----
185950 non-null  int64
dtypes: int64(1)
memory usage: 1.4 MB


In [55]:
# Look at the raw discount values: they are text with a decimal comma
# (e.g. '0,18'), which is why the column is loaded as dtype object
df['discount'].head(10)

0    0,18
1    0,21
2    0,05
3    0,08
4    0,14
5    0,22
6    0,17
7    0,30
8    0,25
9    0,01
Name: discount, dtype: object

In [57]:
# Convert the discount column from decimal-comma text (e.g. '0,18') to a
# whole-number percentage (18, dtype int64).
# The value is parsed as a number and scaled by 100 rather than stripping the
# literal '0,' prefix: prefix-stripping would read '0,5' as 5 (not 50) and
# raise on '1,00'.
# The dtype guard makes the cell safe to re-run: once the column is numeric
# the conversion is skipped instead of failing or scaling a second time.
if not pd.api.types.is_numeric_dtype(df['discount']):
    df['discount'] = (
        df['discount']
        .str.replace(',', '.', regex=False)  # decimal comma -> decimal point
        .astype('float64')
        .mul(100)
        .round()                             # absorb float noise such as 18.000000000000004
        .astype('int64')
    )
df['discount'].head(10)

0    18
1    21
2     5
3     8
4    14
5    22
6    17
7    30
8    25
9     1
Name: discount, dtype: int64

In [58]:
# Re-check the dtypes: discount is now int64, price_each is still object
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185950 entries, 0 to 185949
Data columns (total 11 columns):
 #   Column                     Non-Null Count   Dtype 
---  ------                     --------------   ----- 
 0   order_id                   185950 non-null  int64 
 1   product                    185950 non-null  object
 2   quantity_ordered           185950 non-null  int64 
 3   price_each                 185950 non-null  object
 4   order_date                 185950 non-null  object
 5   customer_shipping_address  185950 non-null  object
 6   city_store                 185950 non-null  object
 7   category                   185950 non-null  object
 8   customer_gender            185950 non-null  object
 9   customer_age_range         185950 non-null  object
 10  discount                   185950 non-null  int64 
dtypes: int64(3), object(8)
memory usage: 15.6+ MB


In [59]:
# Convert price_each from text to float64.
# Only comma characters are removed before parsing. Removing the substring
# '0,' instead would silently turn '10,500.00' into 1500.0 and would fail on
# '1,700.00'.
# NOTE(review): assumes any comma in a price is a thousands separator, not a
# decimal comma - confirm against the raw file.
# astype(str) keeps the cell re-runnable after the column is already float.
df['price_each'] = (
    df['price_each']
    .astype(str)
    .str.replace(',', '', regex=False)
    .astype('float64')
)
df['price_each'].head()

0     11.99
1     99.99
2    700.00
3      3.84
4      3.84
Name: price_each, dtype: float64

In [60]:
# Confirm that price_each is now float64
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185950 entries, 0 to 185949
Data columns (total 11 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   order_id                   185950 non-null  int64  
 1   product                    185950 non-null  object 
 2   quantity_ordered           185950 non-null  int64  
 3   price_each                 185950 non-null  float64
 4   order_date                 185950 non-null  object 
 5   customer_shipping_address  185950 non-null  object 
 6   city_store                 185950 non-null  object 
 7   category                   185950 non-null  object 
 8   customer_gender            185950 non-null  object 
 9   customer_age_range         185950 non-null  object 
 10  discount                   185950 non-null  int64  
dtypes: float64(1), int64(3), object(7)
memory usage: 15.6+ MB


In [61]:
# Persist the type-converted frame. The row index is written too (this is the
# pandas default, spelled out here), which is why a plain read_csv of this
# file shows an extra 'Unnamed: 0' column.
df.to_csv('ecommerce/sales_data_datatype_converted.csv', index=True)

In [63]:
# Read the saved file back without naming an index column; the row index that
# was written out comes back as an extra first column called 'Unnamed: 0'
df2 = pd.read_csv('ecommerce/sales_data_datatype_converted.csv')
df2.columns

Index(['Unnamed: 0', 'order_id', 'product', 'quantity_ordered', 'price_each',
       'order_date', 'customer_shipping_address', 'city_store', 'category',
       'customer_gender', 'customer_age_range', 'discount'],
      dtype='object')

In [66]:
# Check the dtypes after the CSV round trip (12 columns, counting 'Unnamed: 0')
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185950 entries, 0 to 185949
Data columns (total 12 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Unnamed: 0                 185950 non-null  int64  
 1   order_id                   185950 non-null  int64  
 2   product                    185950 non-null  object 
 3   quantity_ordered           185950 non-null  int64  
 4   price_each                 185950 non-null  float64
 5   order_date                 185950 non-null  object 
 6   customer_shipping_address  185950 non-null  object 
 7   city_store                 185950 non-null  object 
 8   category                   185950 non-null  object 
 9   customer_gender            185950 non-null  object 
 10  customer_age_range         185950 non-null  object 
 11  discount                   185950 non-null  int64  
dtypes: float64(1), int64(4), object(7)
memory usage: 17.0+ MB


In [72]:
# Label of the first column, i.e. the stray column produced by the saved row index
df2.columns[0]

'Unnamed: 0'

In [81]:
# Overwrite the file without the stray 'Unnamed: 0' column.
# The column is dropped by name rather than by position, so a real data column
# is never removed if 'Unnamed: 0' happens to be absent (errors='ignore' makes
# that case a no-op). The row index is still written, because the file is read
# back afterwards with index_col=0.
df2.drop(columns='Unnamed: 0', errors='ignore').to_csv('ecommerce/sales_data_datatype_converted.csv')

In [88]:
# Re-read the file, using its first column as the row index so that no
# 'Unnamed: 0' data column is created
df3 = pd.read_csv('ecommerce/sales_data_datatype_converted.csv',index_col = 0)

In [89]:
# Verify the result: 11 data columns with the converted dtypes
df3.info()

<class 'pandas.core.frame.DataFrame'>
Index: 185950 entries, 0 to 185949
Data columns (total 11 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   order_id                   185950 non-null  int64  
 1   product                    185950 non-null  object 
 2   quantity_ordered           185950 non-null  int64  
 3   price_each                 185950 non-null  float64
 4   order_date                 185950 non-null  object 
 5   customer_shipping_address  185950 non-null  object 
 6   city_store                 185950 non-null  object 
 7   category                   185950 non-null  object 
 8   customer_gender            185950 non-null  object 
 9   customer_age_range         185950 non-null  object 
 10  discount                   185950 non-null  int64  
dtypes: float64(1), int64(3), object(7)
memory usage: 17.0+ MB


In [90]:
# Show the first five rows of the in-memory frame.
# NOTE(review): this displays `df`, not the re-read `df3` - confirm which one
# was meant to be inspected here.
df.head()

Unnamed: 0,order_id,product,quantity_ordered,price_each,order_date,customer_shipping_address,city_store,category,customer_gender,customer_age_range,discount
0,236670,Wired Headphones,16,11.99,2019-08-31 22:21:00,"359 Spruce St, Seattle, WA 98101",Dallas,Headphones,Male,18-20,18
1,236671,Bose SoundSport Headphones,9,99.99,2019-08-15 15:11:00,"492 Ridge St, Dallas, TX 75001",Los Angeles,Headphones,Male,21-25,21
2,236672,iPhone,8,700.0,2019-08-06 14:40:00,"149 7th St, Portland, OR 97035",New York City,Phone,Male,26-30,5
3,236673,AA Batteries (4-pack),12,3.84,2019-08-29 20:59:00,"631 2nd St, Los Angeles, CA 90001",San Francisco,Batteries,Female,31-40,8
4,236674,AA Batteries (4-pack),16,3.84,2019-08-15 19:53:00,"736 14th St, New York City, NY 10001",Boston,Batteries,Female,41-50,14
