In [1]:
import pandas as pd

In [9]:
df = pd.read_csv('dataset_corrections_1227.csv')

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 926 entries, 0 to 925
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   title           926 non-null    object
 1   h1_tags         926 non-null    object
 2   word_count      926 non-null    int64 
 3   keywords        102 non-null    object
 4   description     101 non-null    object
 5   DC.Description  101 non-null    object
 6   snapshot_url    926 non-null    object
 7   Timestamp       926 non-null    object
dtypes: int64(1), object(7)
memory usage: 58.0+ KB


In [13]:
df.shape

(926, 8)

In [15]:
# 1. Identify missing values
missing_values = df.isnull().sum()
print("Missing Values in Each Column:")
print(missing_values)

Missing Values in Each Column:
title               0
h1_tags             0
word_count          0
keywords          824
description       825
DC.Description    825
snapshot_url        0
Timestamp           0
dtype: int64


In [17]:
# 2. Replace missing values in specific columns
df['keywords'].fillna('No keywords found', inplace=True)
df['description'].fillna('No description found', inplace=True)
df['DC.Description'].fillna('No description found', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['keywords'].fillna('No keywords found', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['description'].fillna('No description found', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on w

In [19]:
# Verify replacements
print("\nMissing Values After Replacement:")
print(df.isnull().sum())


Missing Values After Replacement:
title             0
h1_tags           0
word_count        0
keywords          0
description       0
DC.Description    0
snapshot_url      0
Timestamp         0
dtype: int64


In [21]:
df.head()

Unnamed: 0,title,h1_tags,word_count,keywords,description,DC.Description,snapshot_url,Timestamp
0,Corrections Department NZ - Corrections Depart...,[],1495,No keywords found,No description found,No description found,https://web.archive.org/web/20150103022537/htt...,3/01/2015 2:25
1,Corrections Department NZ - Corrections Depart...,[],1495,No keywords found,No description found,No description found,https://web.archive.org/web/20150113020105/htt...,13/01/2015 2:01
2,Corrections Department NZ - Corrections Depart...,[],1498,No keywords found,No description found,No description found,https://web.archive.org/web/20150121022536/htt...,21/01/2015 2:25
3,Corrections Department NZ - Corrections Depart...,[],1503,No keywords found,No description found,No description found,https://web.archive.org/web/20150202082918/htt...,2/02/2015 8:29
4,Corrections Department NZ - Corrections Depart...,[],1507,No keywords found,No description found,No description found,https://web.archive.org/web/20150206050235/htt...,6/02/2015 5:02


In [23]:
# 3. Remove rows containing 'Error'
df = df[~df.eq('Error').any(axis=1)]

In [25]:
# Verify removal
print("\nRows after removing 'Error':")
print(df.head())


Rows after removing 'Error':
                                               title h1_tags  word_count  \
0  Corrections Department NZ - Corrections Depart...      []        1495   
1  Corrections Department NZ - Corrections Depart...      []        1495   
2  Corrections Department NZ - Corrections Depart...      []        1498   
3  Corrections Department NZ - Corrections Depart...      []        1503   
4  Corrections Department NZ - Corrections Depart...      []        1507   

            keywords           description        DC.Description  \
0  No keywords found  No description found  No description found   
1  No keywords found  No description found  No description found   
2  No keywords found  No description found  No description found   
3  No keywords found  No description found  No description found   
4  No keywords found  No description found  No description found   

                                        snapshot_url        Timestamp  
0  https://web.archive.org/web/2

In [29]:
# 4. Check if "description" and "DC.Description" match
df['description_DC_match'] = df['description'] == df['DC.Description']

# Display matching and non-matching rows
matching_rows = df[df['description_DC_match'] == True]
non_matching_rows = df[df['description_DC_match'] == False]

print("Number of Matching Rows:", len(matching_rows))
print("Number of Non-Matching Rows:", len(non_matching_rows))


Number of Matching Rows: 923
Number of Non-Matching Rows: 0


In [31]:
# 5. Drop the "DC.Description' column, as it is the same as 'description'
df.drop(columns=['DC.Description'], inplace=True)

In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 923 entries, 0 to 925
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   title                 923 non-null    object
 1   h1_tags               923 non-null    object
 2   word_count            923 non-null    int64 
 3   keywords              923 non-null    object
 4   description           923 non-null    object
 5   snapshot_url          923 non-null    object
 6   Timestamp             923 non-null    object
 7   description_DC_match  923 non-null    bool  
dtypes: bool(1), int64(1), object(6)
memory usage: 58.6+ KB


In [35]:
df.drop(columns=['description_DC_match'], inplace=True)

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 923 entries, 0 to 925
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         923 non-null    object
 1   h1_tags       923 non-null    object
 2   word_count    923 non-null    int64 
 3   keywords      923 non-null    object
 4   description   923 non-null    object
 5   snapshot_url  923 non-null    object
 6   Timestamp     923 non-null    object
dtypes: int64(1), object(6)
memory usage: 57.7+ KB


In [41]:
# 6. Replace the "h1_tag"= "[]" with "null"

if 'h1_tags' in df.columns:
    df['h1_tags'] = df['h1_tags'].apply(lambda x: 'null' if str(x).strip() == '[]' else x)
    print("Empty lists in 'h1_tags' have been replaced with 'null'.")
else:
    print("Column 'h1_tags' does not exist in the dataset.")

Empty lists in 'h1_tags' have been replaced with 'null'.


In [43]:
df['h1_tags']

0      null
1      null
2      null
3      null
4      null
       ... 
921    null
922    null
923    null
924    null
925    null
Name: h1_tags, Length: 923, dtype: object

In [45]:
# 7. Add a colunm of description length, counting by characters
if 'description' in df.columns:
    df['description_length'] = df['description'].apply(
        lambda x: 0 if str(x).strip() == 'No description found' else len(str(x))
    )
    print("Column 'description_length' has been added successfully.")
else:
    print("Column 'description' does not exist in the dataset.")

# Display the first few rows to verify
print(df[['description', 'description_length']].head())

Column 'description_length' has been added successfully.
            description  description_length
0  No description found                   0
1  No description found                   0
2  No description found                   0
3  No description found                   0
4  No description found                   0


In [49]:
# 8. Add a column 'keyword_count', counting the number of keywords
if 'keywords' in df.columns:
    df['keyword_count'] = df['keywords'].apply(
        lambda x: 0 if str(x).strip() == 'No keywords found' else len(str(x).split(','))
    )
    print("Column 'keyword_count' has been added successfully.")
else:
    print("Column 'keywords' does not exist in the dataset.")

Column 'keyword_count' has been added successfully.


In [55]:
# Display the first few rows to verify
print(df[['keywords', 'keyword_count']].head())

            keywords  keyword_count
0  No keywords found              0
1  No keywords found              0
2  No keywords found              0
3  No keywords found              0
4  No keywords found              0


In [57]:
# 9. Check for duplicates in 'Timestamp' column, and remove the duplicate rows
if 'Timestamp' in df.columns:
    duplicate_timestamps = df[df['Timestamp'].duplicated(keep=False)]
    
    if duplicate_timestamps.empty:
        print("No duplicate values found in the 'Timestamp' column.")
    else:
        print(f"Number of duplicate 'Timestamp' values: {duplicate_timestamps.shape[0]}")
        print(duplicate_timestamps[['Timestamp']])
else:
    print("Column 'Timestamp' does not exist in the dataset.")

Number of duplicate 'Timestamp' values: 461
            Timestamp
120   15/01/2019 1:50
121   15/01/2019 1:50
122   15/01/2019 2:08
123   15/01/2019 2:08
124   16/01/2019 0:03
..                ...
850   5/05/2024 16:22
851   5/05/2024 16:22
886  23/09/2024 21:55
887  23/09/2024 21:55
888  23/09/2024 21:55

[461 rows x 1 columns]


In [59]:
# Remove duplicates based on 'Timestamp'
if 'Timestamp' in df.columns:
    initial_row_count = df.shape[0]
    df = df.drop_duplicates(subset='Timestamp', keep='first')
    final_row_count = df.shape[0]
    print(f"Removed {initial_row_count - final_row_count} duplicate rows based on 'Timestamp'.")
else:
    print("Column 'Timestamp' does not exist in the dataset.")

Removed 244 duplicate rows based on 'Timestamp'.


In [61]:
# 10. Save the cleaned dataset to csv
df.to_csv('dataset_corrections_1227_cleaned.csv', index=False)