In [4]:
#Missing values / Empty cells
import pandas as pd

df = pd.read_csv('sales.csv')

#check for empty cells
print(df.isnull().sum())

for col in ['Quantity','Unit Price','Total Revenue']:
    df[col].fillna(df[col].mean(),inplace=True) 
print(df) 

print(df.isnull().sum())


Order ID         0
Customer Name    1
Order Date       0
Product          0
Quantity         1
Unit Price       1
Total Revenue    1
dtype: int64
   Order ID  Customer Name   Order Date   Product   Quantity  Unit Price  \
0      1001       John Doe   01/01/2024  Widget A  10.000000   25.000000   
1      1002     Jane Smith   01/02/2024  Widget B   5.000000   40.000000   
2      1003            NaN  2024/01/03'  Widget A   5.142857   25.000000   
3      1004  Alice Johnson   04/01/2024  Widget C   3.000000   35.714286   
4      1005      Bob Brown  2024/01/05'  Widget B  10.000000   40.000000   
5      1006       John Doe   06/01/2024  Widget A   4.000000   25.000000   
6      1001       John Doe   01/01/2024  Widget A  10.000000   25.000000   
7      1007     Jane Smith   07/01/2024  Widget C  -6.000000   70.000000   

   Total Revenue  
0     250.000000  
1     200.000000  
2     141.428571  
3     210.000000  
4     400.000000  
5     100.000000  
6     250.000000  
7    -420.000000 

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(),inplace=True)


In [5]:
#Inconsistent data formats

df = pd.read_csv('sales.csv')

df['Order Date'] = pd.to_datetime(df['Order Date'], format='mixed')

print(df.to_string())

   Order ID  Customer Name Order Date   Product  Quantity  Unit Price  Total Revenue
0      1001       John Doe 2024-01-01  Widget A      10.0        25.0          250.0
1      1002     Jane Smith 2024-01-02  Widget B       5.0        40.0          200.0
2      1003            NaN 2024-01-03  Widget A       NaN        25.0            NaN
3      1004  Alice Johnson 2024-04-01  Widget C       3.0         NaN          210.0
4      1005      Bob Brown 2024-01-05  Widget B      10.0        40.0          400.0
5      1006       John Doe 2024-06-01  Widget A       4.0        25.0          100.0
6      1001       John Doe 2024-01-01  Widget A      10.0        25.0          250.0
7      1007     Jane Smith 2024-07-01  Widget C      -6.0        70.0         -420.0


In [8]:
#Duplicate rows

#checks for duplicate rows
print(df.duplicated())

#removes duplicate dfs
df.drop_duplicates(inplace = True)

0    False
1    False
2    False
3    False
4    False
5    False
7    False
dtype: bool


In [9]:
#wrong data
#reset duration or
df.loc[7, 'Quantity'] = 6.0

df.loc[7, 'Total Revenue'] = 420.0

#remove row from df
'''for x in df.index:
  if df.loc[x, "Duration"] > 60:
    df.drop(x, inplace = True)'''

print(df.to_string())

   Order ID  Customer Name Order Date   Product  Quantity  Unit Price  Total Revenue
0      1001       John Doe 2024-01-01  Widget A      10.0        25.0          250.0
1      1002     Jane Smith 2024-01-02  Widget B       5.0        40.0          200.0
2      1003            NaN 2024-01-03  Widget A       NaN        25.0            NaN
3      1004  Alice Johnson 2024-04-01  Widget C       3.0         NaN          210.0
4      1005      Bob Brown 2024-01-05  Widget B      10.0        40.0          400.0
5      1006       John Doe 2024-06-01  Widget A       4.0        25.0          100.0
7      1007     Jane Smith 2024-07-01  Widget C       6.0        70.0          420.0


In [11]:
#Unnecessary columns not relevant to analysis
columns_to_drop = ['Order Date','Total Revenue']
new_df = df.drop(columns= columns_to_drop)
print(new_df.to_string())

   Order ID  Customer Name   Product  Quantity  Unit Price
0      1001       John Doe  Widget A      10.0        25.0
1      1002     Jane Smith  Widget B       5.0        40.0
2      1003            NaN  Widget A       NaN        25.0
3      1004  Alice Johnson  Widget C       3.0         NaN
4      1005      Bob Brown  Widget B      10.0        40.0
5      1006       John Doe  Widget A       4.0        25.0
7      1007     Jane Smith  Widget C       6.0        70.0
