In [1]:
import pandas as pd
import great_expectations as ge
from great_expectations.dataset import PandasDataset

In [7]:
# load raw dataset
raw_df = pd.read_csv("fraud_email_.csv")
raw_df.head(5)

Unnamed: 0,Text,Class
0,Supply Quality China's EXCLUSIVE dimensions at...,1
1,over. SidLet me know. Thx.,0
2,"Dear Friend,Greetings to you.I wish to accost ...",1
3,MR. CHEUNG PUIHANG SENG BANK LTD.DES VOEUX RD....,1
4,Not a surprising assessment from Embassy.,0


# 3. Clean Data

In [8]:
# drop missing value
dropped_raw = raw_df.dropna(inplace=True)
print("dropped raw :", dropped_raw)

dropped raw : None


In [9]:
# drop duplicated
drop_duplicates = raw_df.drop_duplicates(inplace=True)
print("dropped duplicates :", drop_duplicates)

dropped duplicates : None


In [10]:
# remove whitespace
removed = raw_df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
print("whitespace removed :", removed)

whitespace removed :                                                     Text  Class
0      Supply Quality China's EXCLUSIVE dimensions at...      1
1                             over. SidLet me know. Thx.      0
2      Dear Friend,Greetings to you.I wish to accost ...      1
3      MR. CHEUNG PUIHANG SENG BANK LTD.DES VOEUX RD....      1
4              Not a surprising assessment from Embassy.      0
...                                                  ...    ...
11923  I called and was transferred to room but got n...      0
11924  Travel well. I'll look forward to hearing your...      0
11926  Follow Up Flag: Follow upFlag Status: FlaggedM...      0
11927  sbwhoeop B6Saturday January 23 2010 4:09 PMRe:...      0
11928  FYI. We are revising call sheet for call to Ka...      0

[10249 rows x 2 columns]


In [11]:
# rename column
raw_df.columns = raw_df.columns.str.lower()

# 4 Save Cleaned Data

In [12]:
clean_df = raw_df.to_csv("fraud_email_clean.csv")

# 5 Validate the Data

In [14]:
# read cleaned data
df = pd.read_csv('fraud_email_clean.csv')
df

Unnamed: 0.1,Unnamed: 0,text,class
0,0,Supply Quality China's EXCLUSIVE dimensions at...,1
1,1,over. SidLet me know. Thx.,0
2,2,"Dear Friend,Greetings to you.I wish to accost ...",1
3,3,MR. CHEUNG PUIHANG SENG BANK LTD.DES VOEUX RD....,1
4,4,Not a surprising assessment from Embassy.,0
...,...,...,...
10244,11923,I called and was transferred to room but got n...,0
10245,11924,Travel well. I'll look forward to hearing your...,0
10246,11926,Follow Up Flag: Follow upFlag Status: FlaggedM...,0
10247,11927,sbwhoeop B6Saturday January 23 2010 4:09 PMRe:...,0


In [15]:
# Create a Great Expectations dataset
class CustomDataset(PandasDataset):
    # Optionally, define custom expectations or methods here
    pass

# Instantiate the Great Expectations dataset
ge_df = CustomDataset(df)

In [16]:
# convert the pandas dataframe to a great_expectations dataset
df_ge = ge.from_pandas(df)

In [17]:
# Expect the unique_id to be unique
result_unique = df_ge.expect_column_values_to_be_unique(column="text")
result_unique

{
  "success": true,
  "result": {
    "element_count": 10249,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [19]:
# Expect a column to be of a certain data type
result_column_type = df_ge.expect_column_most_common_value_to_be_in_set(column="class", value_set=[0, 1])
result_column_type

{
  "success": true,
  "result": {
    "observed_value": [
      0
    ],
    "element_count": 10249,
    "missing_count": null,
    "missing_percent": null
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [20]:
# Expect a column to be of a certain length
result_val_length = df_ge.expect_column_value_lengths_to_be_between(column="text", min_value=5)
result_column_type

# using min value of 5, in emails at least 5 chars detected to say Hello, or Dear, 

{
  "success": true,
  "result": {
    "observed_value": [
      0
    ],
    "element_count": 10249,
    "missing_count": null,
    "missing_percent": null
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}