In [4]:
import pandas as pd

# Load the listings file; this cell rewrites the same file in place at the end.
df = pd.read_csv('cian_apartments.csv')

# Display original column order
print("Original column order:", df.columns.tolist())

# Create 'activity_date' only when it is missing. Assigning unconditionally
# would wipe any values already stored in the column (the file may already
# contain it), which makes re-running this cell destructive.
if 'activity_date' not in df.columns:
    # pd.NA marks the cells as missing; it is written to CSV as an empty field.
    df['activity_date'] = pd.NA

# Build the target order: every column except 'activity_date', in its current
# order, with 'activity_date' slotted in right after 'unpublished_date'.
columns = [name for name in df.columns if name != 'activity_date']
unpublished_date_position = columns.index('unpublished_date')
columns.insert(unpublished_date_position + 1, 'activity_date')

# Reorder the DataFrame with the new column order
df = df[columns]

# Display new column order
print("New column order:", df.columns.tolist())

# Save the modified DataFrame back to CSV
df.to_csv('cian_apartments.csv', index=False)

print("File saved successfully with new 'activity_date' column added after 'unpublished_date'")

Original column order: ['offer_id', 'offer_url', 'title', 'address', 'metro_station', 'neighborhood', 'district', 'description', 'status', 'updated_time', 'activity_date', 'unpublished_date', 'price_info', 'distance', 'price_value', 'rental_period', 'utilities_type', 'commission_info', 'deposit_info', 'commission_value', 'deposit_value', 'cian_estimation_value', 'price_change_value', 'price_difference_value', 'cian_estimation']
New column order: ['offer_id', 'offer_url', 'title', 'address', 'metro_station', 'neighborhood', 'district', 'description', 'status', 'updated_time', 'unpublished_date', 'activity_date', 'price_info', 'distance', 'price_value', 'rental_period', 'utilities_type', 'commission_info', 'deposit_info', 'commission_value', 'deposit_value', 'cian_estimation_value', 'price_change_value', 'price_difference_value', 'cian_estimation']
File saved successfully with new 'activity_date' column added after 'unpublished_date'


In [5]:
import pandas as pd

# Read the listings back from disk
df = pd.read_csv('cian_apartments.csv')

# Report the dtypes of the two date columns before touching them
date_columns = ['unpublished_date', 'activity_date']
print("Original column types:")
print(df[date_columns].dtypes)

# Preview the first few source values
unpublished_preview = df['unpublished_date'].head().tolist()
print("\nSample unpublished_date values:", unpublished_preview)

def is_valid_date(date_str):
    """Return True when ``date_str`` is a string that looks like a filled-in date.

    This is a cheap heuristic, not a parser: non-strings (such as NaN or None),
    the '--' placeholder, and strings of two characters or fewer are rejected;
    every other string is accepted.
    """
    if not isinstance(date_str, str):
        return False
    if date_str == '--':
        return False
    return len(date_str) > 2

# For every row take unpublished_date when it passes is_valid_date,
# otherwise keep whatever activity_date already holds.
df['activity_date'] = [
    unpublished if is_valid_date(unpublished) else current
    for unpublished, current in zip(df['unpublished_date'], df['activity_date'])
]

# Summarise the result of the update
populated_count = df['activity_date'].notna().sum()
activity_preview = df['activity_date'].head().tolist()
print("\nAfter update:")
print("Number of rows with activity_date populated:", populated_count)
print("Sample activity_date values:", activity_preview)

# Write the updated table back over the source file
df.to_csv('cian_apartments.csv', index=False)

print("\nFile saved successfully with 'activity_date' values updated from 'unpublished_date'")

Original column types:
unpublished_date     object
activity_date       float64
dtype: object

Sample unpublished_date values: ['--', '--', '--', '--', '--']

After update:
Number of rows with activity_date populated: 150
Sample activity_date values: [nan, nan, nan, nan, nan]

File saved successfully with 'activity_date' values updated from 'unpublished_date'


In [1]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('cian_apartments.csv')

# Get basic info about the DataFrame
print(f"Total rows in dataset: {len(df)}")

# Collect every row whose offer_id occurs more than once
# (keep=False marks all copies, not only the repeats).
duplicates = df[df.duplicated('offer_id', keep=False)]
has_duplicates = len(duplicates) > 0

# Print results
print(f"Are there duplicate offer_ids? {has_duplicates}")
print(f"Number of duplicate offer_ids: {len(duplicates)}")

# If duplicates exist, show them
if has_duplicates:
    # dropna=False is required: duplicated() treats missing IDs as equal to
    # each other, while value_counts() drops them by default. Without it a
    # file whose only duplicates are missing IDs yields an empty result and
    # the .index[0] lookup below raises IndexError.
    duplicate_counts = df['offer_id'].value_counts(dropna=False)
    # Filter to only show IDs that appear more than once
    duplicated_ids = duplicate_counts[duplicate_counts > 1]

    print("\nDuplicate offer_ids and their counts:")
    print(duplicated_ids)

    # Show the full records for the most frequent duplicated ID
    print("\nSample of duplicate records:")
    first_dup_id = duplicated_ids.index[0]
    # isin() matches NaN against NaN, which a plain == comparison would not
    print(df[df['offer_id'].isin([first_dup_id])])

Total rows in dataset: 569
Are there duplicate offer_ids? False
Number of duplicate offer_ids: 0


In [7]:
import pandas as pd
# Load the CSV file
df = pd.read_csv('cian_apartments.csv')

# Display original info
print("Original column types:")
print(df[['unpublished_date', 'activity_date', 'updated_time']].dtypes)

# Look up one known listing to use as a before/after example. The lookup uses
# the named 'offer_id' column rather than "whatever column happens to be
# first", so it keeps working if the column order of the file changes.
example_id = 316016115
example_row = df[df['offer_id'] == example_id]

if example_row.empty:
    print(f"\nRow with ID {example_id} not found in the dataset")
else:
    # Only the first match is reported
    first_match = example_row.iloc[0]
    print("\nExample row before update:")
    print(f"ID: {example_id}")
    print(f"unpublished_date: {first_match['unpublished_date']}")
    print(f"activity_date: {first_match['activity_date']}")
    print(f"updated_time: {first_match['updated_time']}")

def is_valid_date(date_str):
    """Return True when ``date_str`` is a string that looks like a filled-in date.

    This is a cheap heuristic, not a parser: non-strings (such as NaN or None),
    the '--' placeholder, and strings of two characters or fewer are rejected;
    every other string is accepted.
    """
    if not isinstance(date_str, str):
        return False
    if date_str == '--':
        return False
    return len(date_str) > 2

# Keep the pre-update values in a standalone Series for comparison, so no
# temporary column has to be added to (and later dropped from) the DataFrame.
original_activity_date = df['activity_date'].copy()

# Per-row priority:
#   1. keep activity_date when it already passes is_valid_date
#   2. otherwise take unpublished_date when that passes is_valid_date
#   3. otherwise fall back to updated_time (whatever it contains)
df['activity_date'] = [
    current if is_valid_date(current)
    else unpublished if is_valid_date(unpublished)
    else updated
    for current, unpublished, updated in zip(
        df['activity_date'], df['unpublished_date'], df['updated_time']
    )
]

# NaN != NaN evaluates to True, so a plain inequality would report rows that
# were empty before AND after as "changed"; exclude those explicitly.
still_missing = df['activity_date'].isna() & original_activity_date.isna()
changed_mask = (df['activity_date'] != original_activity_date) & ~still_missing

# Check example row after update if it exists
if not example_row.empty:
    idx = example_row.index[0]
    print("\nExample row after update:")
    print(f"ID: {example_id}")
    print(f"unpublished_date: {df.loc[idx, 'unpublished_date']}")
    print(f"activity_date (before): {original_activity_date.loc[idx]}")
    print(f"activity_date (after): {df.loc[idx, 'activity_date']}")
    print(f"updated_time: {df.loc[idx, 'updated_time']}")

    # Explain what happened with this row
    if changed_mask.loc[idx]:
        if df.loc[idx, 'activity_date'] == df.loc[idx, 'updated_time']:
            print("\nExplanation: Empty or invalid activity_date was replaced with updated_time value")
        elif df.loc[idx, 'activity_date'] == df.loc[idx, 'unpublished_date']:
            print("\nExplanation: Empty or invalid activity_date was replaced with unpublished_date value")
    elif not still_missing.loc[idx]:
        # Rows that are empty both before and after get no explanation:
        # nothing changed, but the value was not valid either.
        print("\nExplanation: activity_date was already valid, no changes were made")

# Show counts of changes
changed_rows = df[changed_mask]
from_updated_time = int((changed_rows['activity_date'] == changed_rows['updated_time']).sum())
from_unpublished_date = int((changed_rows['activity_date'] == changed_rows['unpublished_date']).sum())
print(f"\nTotal rows changed: {len(changed_rows)}")
print(f"Rows updated from updated_time: {from_updated_time}")
print(f"Rows updated from unpublished_date: {from_unpublished_date}")

# Rows whose activity_date is present but still fails is_valid_date
# (missing values are deliberately not counted here).
passes_check = df['activity_date'].map(is_valid_date).astype(bool)
invalid_rows = df[~passes_check & df['activity_date'].notna()]

print(f"\nNumber of rows where activity_date is still invalid: {len(invalid_rows)}")

if len(invalid_rows) > 0:
    preview_limit = 10  # show only the first 10 examples
    preview = invalid_rows.head(preview_limit)
    print("\nSample rows with invalid activity_date after update:")
    print("Row#  activity_date  updated_time  unpublished_date")
    print("-------------------------------------------------")
    for row_label, activity, updated, unpublished in zip(
        preview.index,
        preview['activity_date'],
        preview['updated_time'],
        preview['unpublished_date'],
    ):
        print(f"{row_label}: {activity} | {updated} | {unpublished}")
    # Only mention a remainder when there really is one (avoids "... and 0 more rows")
    remaining = len(invalid_rows) - preview_limit
    if remaining > 0:
        print(f"... and {remaining} more rows")

# Save the modified DataFrame back to CSV
df.to_csv('cian_apartments.csv', index=False)
print("\nFile saved successfully with updated 'activity_date' values")

Original column types:
unpublished_date    object
activity_date       object
updated_time        object
dtype: object

Example row before update:
ID: 316016115
unpublished_date: --
activity_date: nan
updated_time: 2025-04-09 14:45:00

Example row after update:
ID: 316016115
unpublished_date: --
activity_date (before): nan
activity_date (after): 2025-04-09 14:45:00
updated_time: 2025-04-09 14:45:00

Explanation: Empty or invalid activity_date was replaced with updated_time value

Total rows changed: 19
Rows updated from updated_time: 19
Rows updated from unpublished_date: 0

Number of rows where activity_date is still invalid: 0

File saved successfully with updated 'activity_date' values
