In [None]:
import pandas as pd
import numpy as np

# Build the same 5x3 table from an object-dtype NumPy array.
# dtype=object lets one array hold strings, Python lists and floats side by side.
row_labels = [1, 3, 5, 7, 9]
column_names = ['color', 'list', 'number']
records = [
    ['Blue', [1, 2], 1.1],
    ['Red', [3, 4], 2.2],
    ['Pink', [5, 6], 3.3],
    ['Grey', [7, 8], 4.4],
    ['Black', [9, 10], 5.5],
]
data_np = np.array(records, dtype=object)

ecommerce_from_numpy = pd.DataFrame(
    data=data_np,
    index=row_labels,
    columns=column_names,
)

print("DataFrame from NumPy array:")
print(ecommerce_from_numpy)

# Create the same DataFrame from a dict of pandas Series.
# Every Series shares one explicit index, so the columns line up row by row.
shared_index = [1, 3, 5, 7, 9]
raw_columns = {
    'color': ['Blue', 'Red', 'Pink', 'Grey', 'Black'],
    'list': [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]],
    'number': [1.1, 2.2, 3.3, 4.4, 5.5],
}
data_series = {
    column_name: pd.Series(values, index=shared_index)
    for column_name, values in raw_columns.items()
}

ecommerce_from_series = pd.DataFrame(data_series)

print("DataFrame from Pandas Series:")
print(ecommerce_from_series)

# Report the dtype pandas assigned to each column of the NumPy-built frame.
print("Column types:")
print(ecommerce_from_numpy.dtypes)

# Report the Python type of the value stored in the first row of each column.
print("Types of the first value of every column:")
for col, column_values in ecommerce_from_numpy.items():
    first_value_type = type(column_values.iloc[0])
    print(f"Column '{col}': {first_value_type}")

In [None]:
# --- Data Loading, Cleaning, and Transformation ---

# 1. Read the semicolon-separated file ('?' is parsed as missing), discard the
#    columns that are not used afterwards, and index the rows by 'Date'.
print("Step 1: Loading data...")
file_path = 'household_power_consumption_2.txt'
unused_columns = ['Time', 'Sub_metering_2', 'Sub_metering_3']
ecommerce = (
    pd.read_csv(file_path, sep=';', na_values=['?'])
    .drop(columns=unused_columns)
    .set_index('Date')
)

# 2. Convert column and index types
print("Step 2: Updating data types...")
def update_types(ecommerce_to_update):
    """Coerce every column to numeric and parse the index as dates.

    The frame is modified in place and the same object is returned.

    Args:
        ecommerce_to_update: DataFrame whose index holds 'DD/MM/YYYY' strings.

    Returns:
        The very same DataFrame, with numeric columns (unparseable entries
        become NaN) and an index converted by ``pd.to_datetime``.
    """
    frame = ecommerce_to_update
    for column_name in frame.columns:
        # errors='coerce' turns anything that is not a number into NaN.
        coerced = pd.to_numeric(frame[column_name], errors='coerce')
        frame[column_name] = coerced
    parsed_dates = pd.to_datetime(frame.index, format='%d/%m/%Y')
    frame.index = parsed_dates
    return frame
ecommerce_updated = update_types(ecommerce)

# A bare expression in the middle of a cell/script is evaluated and then
# discarded, so the summary statistics must be printed to be visible.
print(ecommerce_updated.describe())

# 3. Drop rows with missing values and create a copy
#    (.copy() gives an independent frame for the column assignment in step 4).
print("Step 3: Dropping missing values...")
ecommerce_cleaned = ecommerce_updated.dropna().copy()

print("--- Sub_metering_1 before modification ---")
print(ecommerce_cleaned['Sub_metering_1'].head())

# 4. Apply the transformation: new value = (old value + 1) * 0.06
print("Step 4: Applying transformation...")
ecommerce_cleaned['Sub_metering_1'] = (ecommerce_cleaned['Sub_metering_1'] + 1) * 0.06

print("--- Sub_metering_1 after modification ---")
print(ecommerce_cleaned['Sub_metering_1'].head())

  # 1. Select rows where Date >= 2008-12-27 and Voltage >= 242
print("--- 1. Filtering Data ---")
# Keep rows dated on/after 2008-12-27 whose Voltage is at least 242.
on_or_after_cutoff = ecommerce_cleaned.index >= '2008-12-27'
high_voltage = ecommerce_cleaned['Voltage'] >= 242
filtered_ecommerce = ecommerce_cleaned[on_or_after_cutoff & high_voltage]
matching_rows = len(filtered_ecommerce)
print(f"Found {matching_rows} rows matching the criteria.")

print("\n--- 2. 88888th Row of Filtered Data ---")
# Positional lookup at offset 88888, guarded so a short result does not raise.
if matching_rows <= 88888:
    print("There are not enough rows in the filtered data to select the 88888th row.")
else:
    print(filtered_ecommerce.iloc[88888])


# 3. Find the date of the maximum Global_active_power
print("\n--- 3. Date of Maximum Global Active Power ---")
power_series = ecommerce_cleaned['Global_active_power']
# idxmax() returns the index label (a date) of the largest value.
max_power_date = power_series.idxmax()
print(f"The Global_active_power was maximal on: {max_power_date.date()}")


# 4. Sort by Global_active_power (descending), then Voltage (ascending),
#    and show only the first three columns of the result.
print("\n--- 4. Sorted DataFrame (First 3 Columns) ---")
sort_keys = ['Global_active_power', 'Voltage']
sorted_ecommerce = ecommerce_cleaned.sort_values(by=sort_keys, ascending=[False, True])
first_three_columns = sorted_ecommerce.iloc[:, :3]
print(first_three_columns.head())


# 5. Compute the daily average of Global_active_power
print("\n--- 5. Daily Average of Global Active Power ---")
daily_avg_power = power_series.resample('D').mean()
print(daily_avg_power.head())



In [None]:
ecommerce = pd.read_csv("Ecommerce_purchases.txt", sep=',')
# Strip stray whitespace from the header names up front, so every column
# lookup below sees the cleaned labels.
ecommerce.columns = ecommerce.columns.str.strip()
#print(ecommerce.head())

# DataFrame.shape is (number of rows, number of columns).
row_count, column_count = ecommerce.shape
print(f"Total Rows = {row_count}")
# This line previously printed the column count under the label "Total Rows".
print(f"Total Columns = {column_count}")

purchase_price = ecommerce['Purchase Price'].mean()
print(f"Average Purchase Price = {purchase_price}")

# Each count below sums a boolean mask: True counts as 1, False as 0.
english_speakers = ecommerce['Language']
num_english_speakers = int((english_speakers == 'en').sum())
print(f"Number of English Speakers = {num_english_speakers}")

jobs = ecommerce['Job']
num_lawyers = int((jobs == 'Lawyer').sum())
print(f"Number of Lawyers = {num_lawyers}")

am_buyers = ecommerce['AM or PM']
num_am_buyers = int((am_buyers == 'AM').sum())
print(f"Number of AM Buyers = {num_am_buyers}")

num_pm_buyers = int((am_buyers == 'PM').sum())
print(f"Number of PM Buyers = {num_pm_buyers}")

# Frequency of every job title, most common first.
top_jobs = jobs.value_counts()
# print(f"Top jobs {top_jobs.head(5)}")

# Purchase price of the transaction whose Lot is '90 WT'.
# A boolean mask selects the matching row(s); .item() then extracts the single
# value and raises if the selection does not contain exactly one element.
is_the_correct_lot = ecommerce['Lot'].eq('90 WT')
transaction_row = ecommerce[is_the_correct_lot]
purchase_price = transaction_row['Purchase Price'].item()
print(f"Purchase Price for Lot '90 WT' = {purchase_price}")

# Email of the customer who paid with one specific credit card number.
credit_card_to_find = 4926535242672853
has_that_card = ecommerce['Credit Card'].eq(credit_card_to_find)
email_address = ecommerce['Email'][has_that_card].item()

print(f"Email of the person with Credit Card number '4926535242672853' = {email_address}")

# American Express purchases priced at 95.0 or more.
is_amex = ecommerce['CC Provider'] == 'American Express'
is_at_least_95 = ecommerce['Purchase Price'] >= 95.0
american_express = ecommerce[is_amex & is_at_least_95]
print(f'Total american express above $95 {len(american_express)}')


# Expiry dates whose text ends in '25'; summing the mask counts the matches.
expiry = ecommerce['CC Exp Date'].str.endswith('25')
num_expiring_2025 = expiry.sum()
print(f'Cards expiring in 2025 = {num_expiring_2025}')


# Remove leading/trailing whitespace from the column labels.
ecommerce.columns = ecommerce.columns.str.strip()

# --- Step 1: Extract the domain from each email address ---
# Splitting on '@' yields a list per address; element 1 of that list is the
# part after the first '@', i.e. the provider's domain.
email_parts = ecommerce['Email'].str.split('@')
email_providers = email_parts.str.get(1)

# --- Step 2: Count the occurrences of each provider ---
# value_counts() tallies each distinct domain, most frequent first.
provider_counts = email_providers.value_counts()

# --- Step 3: Get the top 5 most popular providers ---
# The counts are already sorted, so the first five entries are the answer.
top_5_providers = provider_counts.iloc[:5]

print("Top 5 most popular email providers:")
print(top_5_providers)



In [None]:
import pandas as pd
import numpy as np

iris_data = pd.read_csv('iris.csv', sep=',')
# DataFrame.drop() returns a new frame and leaves the original untouched, so
# the result has to be bound back to the name for the column to be removed.
iris_data = iris_data.drop(columns=['flower'])
print(iris_data.head(5))


columns_to_convert = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

# Convert all four measurement columns in one pass; values that cannot be
# parsed as numbers become NaN (errors='coerce'). This already covers
# 'sepal_length', so no separate conversion of that column is needed.
iris_data[columns_to_convert] = iris_data[columns_to_convert].apply(pd.to_numeric, errors='coerce')

# Per-column replacement for missing values:
#   sepal_length -> column mean, sepal_width -> column median,
#   petal_length / petal_width -> 0.
# mean() and median() skip NaN, so they are computed from the present values.
sepal_length_mean = iris_data['sepal_length'].mean()
sepal_width_median = iris_data['sepal_width'].median()
columns_to_zero = ['petal_length', 'petal_width']

fill_values = {
    'sepal_length': sepal_length_mean,
    'sepal_width': sepal_width_median,
    **dict.fromkeys(columns_to_zero, 0),
}
# fillna() with a dict fills each listed column with its own value.
iris_data = iris_data.fillna(value=fill_values)
    

# Median of each measurement column, computed on the frame as it stands now.
numeric_part = iris_data[columns_to_convert]
column_medians = numeric_part.median()
print("\nMedians of the columns:")
print(column_medians)
print("-" * 30)

# Fill any NaN still present with the matching column's median.
# NOTE(review): the statements just above this block already fill the missing
# values in these four columns, so this call is presumably a no-op and the
# medians include those imputed values - confirm that this is intended.
iris_data[columns_to_convert] = numeric_part.fillna(column_medians)




Medians of the columns:
sepal_length    5.8
sepal_width     3.0
petal_length    4.0
petal_width     1.3
dtype: float64
------------------------------


# Handling Missing Values in Pandas: Why Mean or Zero Imputation Can Be Problematic

When dealing with missing values in a dataset, simple imputation methods like filling with `0` or the column mean can seem appealing due to their simplicity. However, these approaches often introduce significant issues, especially when encountering problematic data like the "special row" (`122,always,check,the,data,!!!!!!!!`). This document explains why mean or zero imputation is inappropriate, particularly in the context of such a row, and outlines a better approach to handle missing or corrupted data.

## Why Filling Missing Values with 0 or the Mean is a Bad Idea

### 1. Mean or Zero Imputation is Inappropriate for Categorical/Mixed Data
- **Data Type Mismatch**: The "special row" (`122,always,check,the,data,!!!!!!!!`) holds text where numbers are expected. Pandas cannot compute a mean for a column containing such a value and raises a `TypeError`. If you convert the column with `pd.to_numeric(..., errors='coerce')`, the entry becomes `NaN`, which only records that the original value was not numeric. Imputing `0` or the mean at that point is meaningless and does not address the underlying issue.
- **Loss of Information**: Replacing the "special row" with `0` or the mean erases critical evidence of a data quality issue. This row acts as an internal alarm, signaling a failure in data collection, cleaning, or parsing. Imputing with a numeric value hides this problem, potentially allowing broader data quality issues to go unnoticed and propagate through the analysis.

### 2. Mean or Zero Imputation Biases the Data
- **Distorts Distribution**: If you extract a numeric part from the "special row" (e.g., `122`) and impute missing values with it, you risk introducing an outlier that skews the data distribution. Conversely, imputing with `0` in a column whose valid values are all strictly positive (e.g., sepal lengths) artificially drags the distribution downward, misrepresenting the data's true range and central tendency.
- **Underestimates Variance**: Mean imputation reduces the variance of a column by replacing missing values with a single fixed value (the mean). This creates an overly optimistic view of the data's consistency, which can lead to incorrect statistical inferences or hypothesis tests. For example, variance-based analyses (e.g., standard deviation, confidence intervals) will be biased, affecting model performance.

### 3. It's an Unsuitable Solution for Context-Dependent Problems
- **Ignores Missingness Mechanism**: The "special row" represents a **Missing Not at Random (MNAR)** scenario, where the missing or corrupted data is related to the value itself (e.g., due to human error or process failure). Mean or zero imputation assumes data is **Missing Completely at Random (MCAR)**, which is inappropriate here. A model trained on such imputed data may fail to learn the true patterns, leading to poor performance in real-world applications.
- **Loss of Relationships**: For models relying on feature relationships (e.g., regression), mean imputation disrupts correlations by assigning all missing values the same number. This weakens or destroys the relationships between features, reducing the model's ability to capture meaningful patterns.

## How to Handle the Special Row Properly

For a data quality issue as blatant as the "special row," simple imputation is the wrong approach. A robust data cleaning process is necessary to address such issues effectively. Below is a step-by-step strategy to handle the "special row" and other missing values:

1. **Isolate the Problem Row**  
   Identify rows containing malformed data, such as the "special row." For example, you can search for rows containing specific string patterns.

   ```python
   bad_data_row = df[df['sepal_length'].astype(str).str.contains("always")]
   ```

2. **Inspect and Decide on an Action**  
   Upon inspection, confirm that the row is invalid and cannot be salvaged. The "special row" (`122,always,check,the,data,!!!!!!!!`) is clearly erroneous and does not represent a valid data point.

3. **Remove the Problematic Row**  
   Drop the row from the DataFrame to eliminate the corrupted data. This is the safest approach when the data is unrecoverable.

   ```python
   df_cleaned = df.drop(bad_data_row.index)
   ```

4. **Fill Remaining Missing Values**  
   After removing the problematic row, apply a robust imputation strategy for any remaining legitimately missing values. For numeric columns, using the median is often more appropriate than the mean, as it is less sensitive to outliers.

   ```python
   df_cleaned = df_cleaned.fillna(df_cleaned.median(numeric_only=True))
   ```

## Conclusion

Filling missing values with `0` or the mean is a poor choice for datasets with corrupted or context-dependent missing data, such as the "special row." These methods can distort distributions, underestimate variance, and hide critical data quality issues. Instead, a careful cleaning process—identifying and removing erroneous data followed by appropriate imputation for legitimate missing values—ensures a more accurate and reliable dataset for analysis. By addressing data quality issues explicitly, you preserve the integrity of your analysis and improve the performance of downstream models.