# Data Understanding

In [1]:
# Import Libraries
import pandas as pd

In [2]:
# Load the Guardian environment news CSV (path is relative to the notebook's
# working directory) and preview the first rows.
DATA_PATH = "data/guardian_environment_news.csv"
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Title,Intro Text,Authors,Article Text,Date Published
0,Liz Truss ‘will approve more oil drilling if ...,Tory leadership candidate criticised by campai...,"['Rob Davies', '@ByRobDavies']",Liz Truss will sign off on a push for more oil...,2022-08-30
1,Renewed Highland golf course plan has environm...,Scottish government rejected a new links at Co...,"['Ewan Murray', '@mrewanmurray']",It is an area so tranquil that the notion of b...,2021-03-22
2,Visiting green spaces deters mental health dr...,Positive effects were stronger among those rep...,"['Damien Gayle', '@damiengayle']","Visits to parks, community gardens and other u...",2023-01-17
3,Bought too much red cabbage? Turn it into a fe...,This fantastic vegan centrepiece makes full us...,['Tom Hunt'],"I devised today’s nut roast for Oddbox, a veg ...",2023-12-22
4,‘This year has been very good’: readers’ UK bu...,Readers share their favourite sightings over t...,['Guardian readers'],‘Constant companions to our gardening’A peacoc...,2023-12-19


In [3]:
# Summarise the dataset dimensions: column names, row count and (rows, columns).
shape = df.shape
rows = shape[0]
columns = list(df.columns)

print(
    f"Columns: {columns}",
    f"Total Rows: {rows}",
    f"Shape of the dataset: {shape}",
    sep="\n",
)

Columns: ['Title', 'Intro Text', 'Authors', 'Article Text', 'Date Published']
Total Rows: 30059
Shape of the dataset: (30059, 5)


In [4]:
# Print the per-column non-null counts, dtypes and memory usage of the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30059 entries, 0 to 30058
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Title           29111 non-null  object
 1   Intro Text      29977 non-null  object
 2   Authors         25489 non-null  object
 3   Article Text    29691 non-null  object
 4   Date Published  27618 non-null  object
dtypes: object(5)
memory usage: 1.1+ MB


In [5]:
# Count the null entries of every column in one pass, then report them
# one column per line.
null_counts = df.isnull().sum()
for column_name, null_count in null_counts.items():
    print(f"Null value of {column_name}: {null_count}")

Null value of Title: 948
Null value of Intro Text: 82
Null value of Authors: 4570
Null value of Article Text: 368
Null value of Date Published: 2441


In [6]:
def check_missing_values(df):
    """Print every row of ``df`` that contains at least one null value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    None
        The result is only printed: either a "no missing values" message,
        or a header line followed by the affected rows.
    """
    has_null = df.isna().any(axis=1)  # True for rows with >= 1 null cell
    rows_with_nulls = df.loc[has_null]

    if rows_with_nulls.empty:
        print("No missing values found in the dataset.")
        return

    print("Rows with Missing Values:")
    print(rows_with_nulls)

# Report the rows of the loaded dataset that have at least one null value
check_missing_values(df)

Rows with Missing Values:
                                                   Title  \
9      The bear truth: why happy pandas can’t be both...   
10      Shell chief vows to bolster emissions strateg...   
14      Here in British Columbia, we have spent the s...   
15      This article is more than 5 months oldSunak’s...   
18     The Guardian view on switching off: in an alwa...   
...                                                  ...   
30040   Of course I’m hot – for the return of wrestli...   
30044   When our planet is under attack we have to st...   
30046                      ObituarySimon Pepper obituary   
30048   Our laws make slaves of nature. It’s not just...   
30057                                                NaN   

                                              Intro Text  \
9      Researchers now think pandas’ notoriously low ...   
10      Ben van Beurden pledges to ‘rise to challenge...   
14     Blazes are destroying whole communities. The C...   
15     Consen

In [7]:
# Count the rows that are exact copies of an earlier row
# (``duplicated`` leaves the first occurrence unmarked by default).
duplicate_mask = df.duplicated()
duplicated_data = duplicate_mask.sum()
print(f"Total duplicated data: {duplicated_data}")

Total duplicated data: 0


In [8]:
def check_duplicate_values(df):
    """Print every row of ``df`` that repeats an earlier row exactly.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    None
        The result is only printed: either a "no duplicated values" message,
        or a header line followed by the repeated rows.
    """
    is_repeat = df.duplicated()  # first occurrence of each row stays False
    repeated_rows = df.loc[is_repeat]

    if repeated_rows.empty:
        print("No duplicated values found in the dataset.")
        return

    print("Rows with Duplicated Values:")
    print(repeated_rows)

# Report the rows of the loaded dataset that duplicate an earlier row
check_duplicate_values(df)

No duplicated values found in the dataset.


In [9]:
# Descriptive statistics for article length (characters) and word count.
# Missing articles are replaced with an empty string (not 0), so they are
# counted as 0 characters / 0 words in the summary below.
# NOTE: this overwrites the "Article Text" column in place, so null checks
# run after this cell no longer report the missing articles.
df["Article Text"] = df["Article Text"].fillna("").astype(str)

# Vectorised string accessors instead of row-wise ``apply``:
# ``str.len`` counts characters, ``str.split`` with no arguments splits on
# runs of whitespace, and ``str.len`` on the resulting lists counts the words.
df["Text_Length"] = df["Article Text"].str.len()
df["Word_Count"] = df["Article Text"].str.split().str.len()
print(df[["Text_Length", "Word_Count"]].describe())

        Text_Length    Word_Count
count  30059.000000  30059.000000
mean    4862.713663    784.057986
std     3150.041167    513.307735
min        0.000000      0.000000
25%     2889.000000    465.000000
50%     4369.000000    698.000000
75%     5975.000000    958.000000
max    50264.000000   7882.000000


In [10]:
# Distinct article titles: how many there are, and what they look like.
# ``nunique`` ignores missing titles, while ``unique`` keeps NaN as a value,
# so the array can hold one more entry than the reported count.
title_column = df["Title"]
unique_title = title_column.unique()
total_title = title_column.nunique()

print(f"Total unique title: {total_title}")
print(f"Sample unique title: {unique_title}")

Total unique title: 29109
Sample unique title: [' Liz Truss ‘will approve more oil drilling if she becomes PM’'
 "Renewed Highland golf course plan has environmentalists crying 'Fore!'"
 ' Visiting green spaces deters mental health drug use, researchers find'
 ... ' Republican lawmaker pitches carbon tax in defiance of party stance'
 ' MPs call for urgent investigation into $180m in water buybacks'
 ' Climate crisis must not be overshadowed by Covid, Johnson to tell UN']
