In [1]:
import pandas as pd

# Step 2: Define the path to your CSV file
file_path = '/content/drive/My Drive/organizations-100.csv'

# Step 3: Load the dataset
df = pd.read_csv(file_path)

# Optional: Preview the dataset
df.head()


Unnamed: 0,Index,Organization Id,Name,Website,Country,Description,Founded,Industry,Number of employees
0,1,FAB0d41d5b5d22c,Ferrell LLC,https://price.net/,Papua New Guinea,Horizontal empowering knowledgebase,1990,Plastics,3498
1,2,6A7EdDEA9FaDC52,"Mckinney, Riley and Day",http://www.hall-buchanan.info/,Finland,User-centric system-worthy leverage,2015,Glass / Ceramics / Concrete,4952
2,3,0bFED1ADAE4bcC1,Hester Ltd,http://sullivan-reed.com/,China,Switchable scalable moratorium,1971,Public Safety,5287
3,4,2bFC1Be8a4ce42f,Holder-Sellers,https://becker.com/,Turkmenistan,De-engineered systemic artificial intelligence,2004,Automotive,921
4,5,9eE8A6a4Eb96C24,Mayer Group,http://www.brewer.com/,Mauritius,Synchronized needs-based challenge,1991,Transportation,7870


In [2]:
# Step 1: Data Ingestion - Initial Sanity Checks
initial_shape = df.shape
basic_info = df.info()
head = df.head()

# Step 2: Deduplication
df = df.drop_duplicates()

# Step 3: Column Management
# Example: drop columns with 'id' or 'note' in their name
columns_to_drop = [col for col in df.columns if 'id' in col.lower() or 'note' in col.lower()]
df = df.drop(columns=columns_to_drop)

# Rename columns for clarity (example renaming)
df = df.rename(columns=lambda x: x.strip().lower().replace(" ", "_"))

# Step 4: Missing Value Handling
missing_summary = df.isna().sum()

# Drop rows/columns if >70% missing
threshold = 0.7
row_thresh = int((1 - threshold) * df.shape[1])
df = df.dropna(thresh=row_thresh)

col_thresh = int((1 - threshold) * df.shape[0])
df = df.dropna(axis=1, thresh=col_thresh)

# Fill remaining missing values
for col in df.columns:
    if df[col].dtype in ['float64', 'int64']:
        df[col] = df[col].fillna(df[col].median())
    elif df[col].dtype == 'object':
        df[col] = df[col].fillna(df[col].mode()[0])

# Step 5: Data Type Correction
for col in df.columns:
    # Try converting to datetime
    try:
        df[col] = pd.to_datetime(df[col])
    except:
        pass
    # Try converting to numeric
    try:
        df[col] = pd.to_numeric(df[col].astype(str).str.replace(",", ""), errors='ignore')
    except:
        pass

# Step 6: Format Standardization
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].str.lower().str.strip()

df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Index                100 non-null    int64 
 1   Organization Id      100 non-null    object
 2   Name                 100 non-null    object
 3   Website              100 non-null    object
 4   Country              100 non-null    object
 5   Description          100 non-null    object
 6   Founded              100 non-null    int64 
 7   Industry             100 non-null    object
 8   Number of employees  100 non-null    int64 
dtypes: int64(3), object(6)
memory usage: 7.2+ KB


  df[col] = pd.to_numeric(df[col].astype(str).str.replace(",", ""), errors='ignore')
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_numeric(df[col].astype(str).str.replace(",", ""), errors='ignore')
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_numeric(df[col].astype(str).str.replace(",", ""), errors='ignore')
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_numeric(df[col].astype(str).str.replace(",", ""), errors='ignore')
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_numeric(df[col].astype(str).str.replace(",", ""), errors='ignore')
  df[col] = pd.to_numeric(df[col].astype(str).str.replace(",", ""), errors='ignore')
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_numeric(df[col].astype(str).str.replace(",", ""), errors='ignore')
  df[col] = pd.to_numeric(df[col].astype(str).str.replace(",", ""), errors='ignore')


Unnamed: 0,index,name,website,country,description,founded,industry,number_of_employees
0,1970-01-01 00:00:00.000000001,ferrell llc,https://price.net/,papua new guinea,horizontal empowering knowledgebase,1970-01-01 00:00:00.000001990,plastics,1970-01-01 00:00:00.000003498
1,1970-01-01 00:00:00.000000002,mckinney riley and day,http://www.hall-buchanan.info/,finland,user-centric system-worthy leverage,1970-01-01 00:00:00.000002015,glass / ceramics / concrete,1970-01-01 00:00:00.000004952
2,1970-01-01 00:00:00.000000003,hester ltd,http://sullivan-reed.com/,china,switchable scalable moratorium,1970-01-01 00:00:00.000001971,public safety,1970-01-01 00:00:00.000005287
3,1970-01-01 00:00:00.000000004,holder-sellers,https://becker.com/,turkmenistan,de-engineered systemic artificial intelligence,1970-01-01 00:00:00.000002004,automotive,1970-01-01 00:00:00.000000921
4,1970-01-01 00:00:00.000000005,mayer group,http://www.brewer.com/,mauritius,synchronized needs-based challenge,1970-01-01 00:00:00.000001991,transportation,1970-01-01 00:00:00.000007870
