In [1]:
# Import packages
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## Concatenate Original Datasets

In [2]:
# Read the data from the CSV file
data1 = pd.read_csv("Data/socialinsider_events_2024-05.csv")
data2 = pd.read_csv("Data/socialinsider_events_2024-06.csv")
data3 = pd.read_csv("Data/socialinsider_events_2024-07.csv")
data4 = pd.read_csv("Data/socialinsider_events_2024-08.csv")
data5 = pd.read_csv("Data/socialinsider_events_2024-09.csv")

In [3]:
# Concatenate the data
data = pd.concat([data1, data2, data3, data4, data5])

# Sort the data by time_created
data = data.sort_values(by="time_created")

In [4]:
data

Unnamed: 0,event_name,user_id,time_created,user_type,time_zone,country,view,platform,report_type,load_time
237525,profile load success,38bd4c351e290c6024d39d44b9a2750ea666eefc9b1069...,2024-04-30T20:59:17.702Z,trial,Europe/London,Britain (UK),profile,ig,,1.264
237524,section change,38bd4c351e290c6024d39d44b9a2750ea666eefc9b1069...,2024-04-30T21:00:06.728Z,trial,Europe/London,Britain (UK),profile,ig,,
237523,compare,38bd4c351e290c6024d39d44b9a2750ea666eefc9b1069...,2024-04-30T21:00:06.765Z,trial,Europe/London,Britain (UK),profile,instagram,,
237522,section change,38bd4c351e290c6024d39d44b9a2750ea666eefc9b1069...,2024-04-30T21:00:20.243Z,trial,Europe/London,Britain (UK),profile,ig,,
237521,profile visit,38bd4c351e290c6024d39d44b9a2750ea666eefc9b1069...,2024-04-30T21:00:27.275Z,trial,Europe/London,Britain (UK),profile,tw,,
...,...,...,...,...,...,...,...,...,...,...
4,i10e_new_project,5ec95a7548babd264a6572cbec3d2e92d47bc26945552a...,2024-09-17T15:40:02.040Z,trial,America/New_York,United States,projecthome,xch,,
3,profile visit,74f7758e77ac3a7ed66af7e7226e2aad60f18310244cb7...,2024-09-17T15:40:23.029Z,trial,Asia/Manila,Philippines,profile,fb,,
2,profile load success,74f7758e77ac3a7ed66af7e7226e2aad60f18310244cb7...,2024-09-17T15:40:24.227Z,trial,Asia/Manila,Philippines,profile,fb,,1.732
1,section change,74f7758e77ac3a7ed66af7e7226e2aad60f18310244cb7...,2024-09-17T15:40:24.836Z,trial,Asia/Manila,Philippines,profile,fb,,


### Create a column to determine whether the event is from converted users

In [5]:
# Create a column that show whether the event is successful or not
buy_users = data[
    (data["event_name"] == "New Client") | (data["event_name"] == "Buy Success")
]["user_id"].tolist()
# 1 if the user is in buy_users, 0 otherwise
data["successful"] = data["user_id"].apply(lambda x: 1 if x in buy_users else 0)
data

Unnamed: 0,event_name,user_id,time_created,user_type,time_zone,country,view,platform,report_type,load_time,successful
237525,profile load success,38bd4c351e290c6024d39d44b9a2750ea666eefc9b1069...,2024-04-30T20:59:17.702Z,trial,Europe/London,Britain (UK),profile,ig,,1.264,0
237524,section change,38bd4c351e290c6024d39d44b9a2750ea666eefc9b1069...,2024-04-30T21:00:06.728Z,trial,Europe/London,Britain (UK),profile,ig,,,0
237523,compare,38bd4c351e290c6024d39d44b9a2750ea666eefc9b1069...,2024-04-30T21:00:06.765Z,trial,Europe/London,Britain (UK),profile,instagram,,,0
237522,section change,38bd4c351e290c6024d39d44b9a2750ea666eefc9b1069...,2024-04-30T21:00:20.243Z,trial,Europe/London,Britain (UK),profile,ig,,,0
237521,profile visit,38bd4c351e290c6024d39d44b9a2750ea666eefc9b1069...,2024-04-30T21:00:27.275Z,trial,Europe/London,Britain (UK),profile,tw,,,0
...,...,...,...,...,...,...,...,...,...,...,...
4,i10e_new_project,5ec95a7548babd264a6572cbec3d2e92d47bc26945552a...,2024-09-17T15:40:02.040Z,trial,America/New_York,United States,projecthome,xch,,,0
3,profile visit,74f7758e77ac3a7ed66af7e7226e2aad60f18310244cb7...,2024-09-17T15:40:23.029Z,trial,Asia/Manila,Philippines,profile,fb,,,0
2,profile load success,74f7758e77ac3a7ed66af7e7226e2aad60f18310244cb7...,2024-09-17T15:40:24.227Z,trial,Asia/Manila,Philippines,profile,fb,,1.732,0
1,section change,74f7758e77ac3a7ed66af7e7226e2aad60f18310244cb7...,2024-09-17T15:40:24.836Z,trial,Asia/Manila,Philippines,profile,fb,,,0


### Convert the timestamp

In [6]:
# create the date that the event was created
data["Date_Created"] = pd.to_datetime(data["time_created"]).dt.date
# create the hour that the event was created
data["Hour_Created"] = pd.to_datetime(data["time_created"]).dt.hour
# create the day of the week that the event was created
data["Day_Of_Week_Created"] = pd.to_datetime(data["time_created"]).dt.day_name()

### Check for Missing Values

In [7]:
data["load_time"].isnull().sum()

830333

### Create Buy Success Event and Unsuccessful Event

In [8]:
buy_users = data[
    (data["event_name"] == "New Client") | (data["event_name"] == "Buy Success")
]["user_id"].tolist()
buy_users_events = data[data["user_id"].isin(buy_users)]
# create a dataset with only the unsuccessful events
not_buy_users_events = data[data["successful"] == 0]

### View in buy success event

In [9]:
# calculate the percentage of views in both successful and not successful events
print(
    "buy success event",
    buy_users_events["view"].value_counts() / len(buy_users_events) * 100,
)
print(
    "not buy success event",
    not_buy_users_events["view"].value_counts() / len(not_buy_users_events) * 100,
)

buy success event view
profile        20.923334
benchmark      20.617410
projecthome    10.559006
brands          6.044313
postsfeed       2.197089
campaigns       1.538889
settings        1.075368
upgradeplan     0.862149
reports         0.723093
ads             0.157597
add             0.120515
hashtag         0.111245
addprofiles     0.055623
proj            0.018541
bench           0.018541
connect         0.009270
Name: count, dtype: float64
not buy success event view
profile        38.787263
benchmark       9.105269
projecthome     7.112417
postsfeed       2.894668
brands          2.620945
campaigns       2.235629
reports         0.714368
upgradeplan     0.636325
hashtag         0.500766
settings        0.184531
bench           0.029175
add             0.027820
proj            0.013233
ads             0.008440
addprofiles     0.007294
search          0.003959
connect         0.001771
page            0.000104
Name: count, dtype: float64


# User Data Transformation

#### Success, event count, and country

In [10]:
user_data = (
    data.groupby("user_id")
    .agg(
        {
            "event_name": "count",  # Count events
            "country": "first",  # Take the first occurrence of country (assuming it's the same for each user)
        }
    )
    .reset_index()
    .rename(columns={"event_name": "event_count"})  # Rename for clarity
    .sort_values(by="event_count", ascending=False)  # Sort by event_count
)

In [11]:
# Convert counties to separate columns if they are in certain countries
user_data["country_United_States"] = user_data["country"].apply(
    lambda x: 1 if x == "United States" else 0
)
user_data["country_Saudi_Arabia"] = user_data["country"].apply(
    lambda x: 1 if x == "Saudi Arabia" else 0
)
user_data["country_India"] = user_data["country"].apply(
    lambda x: 1 if x == "India" else 0
)
user_data["country_Britain"] = user_data["country"].apply(
    lambda x: 1 if x == "Britain (UK)" else 0
)
user_data["country_Italy"] = user_data["country"].apply(
    lambda x: 1 if x == "Italy" else 0
)

In [12]:
# Add the column to show whether the user is successful or not
user_data["successful"] = user_data["user_id"].apply(
    lambda x: 1 if x in buy_users else 0
)
user_data

Unnamed: 0,user_id,event_count,country,country_United_States,country_Saudi_Arabia,country_India,country_Britain,country_Italy,successful
8543,cd6d41c28b017733e39fc13e4d4ae3b308c6baff52561d...,6973,United States,1,0,0,0,0,0
3713,5929c3eec5a2b12ee4aa911b8093b33ac7b798d59e3b14...,4700,Indonesia,0,0,0,0,0,0
4081,622748abdc404cbf098568074c232b04762d574fa48130...,3925,Ukraine,0,0,0,0,0,0
10483,fd15147418cba03da1908de36ac79bc24589c9d1474716...,3538,Thailand,0,0,0,0,0,0
3857,5c54eb7d3e4821d8e633e7ef23dacafe3f5a6530715510...,3387,Germany,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
5897,8f14ccc4db353eaefaab6ac8efc4a3b68b2a8731a3b35a...,0,,0,0,0,0,0,0
10147,f4f9987e642659097752e755b740f7e487f6c252f5be0c...,0,,0,0,0,0,0,0
10467,fcc49dce067e042b96df3e56d2934e551fa4b45d0dcdb0...,0,,0,0,0,0,0,0
1822,2a6612b86399adeeb92d2b44603d8736b1dbfcb84ccf24...,0,,0,0,0,0,0,0


#### Transform Load Time

In [13]:
# get the average load time for each client and sort by the average load time
average_load_time = data.groupby("user_id")["load_time"].mean().sort_values()

In [14]:
# add the column to show the average load time for each user
user_data["average_load_time"] = user_data["user_id"].apply(
    lambda x: average_load_time[x] if x in average_load_time else None
)
user_data

Unnamed: 0,user_id,event_count,country,country_United_States,country_Saudi_Arabia,country_India,country_Britain,country_Italy,successful,average_load_time
8543,cd6d41c28b017733e39fc13e4d4ae3b308c6baff52561d...,6973,United States,1,0,0,0,0,0,1.238146
3713,5929c3eec5a2b12ee4aa911b8093b33ac7b798d59e3b14...,4700,Indonesia,0,0,0,0,0,0,1.040046
4081,622748abdc404cbf098568074c232b04762d574fa48130...,3925,Ukraine,0,0,0,0,0,0,1.823422
10483,fd15147418cba03da1908de36ac79bc24589c9d1474716...,3538,Thailand,0,0,0,0,0,0,1.692242
3857,5c54eb7d3e4821d8e633e7ef23dacafe3f5a6530715510...,3387,Germany,0,0,0,0,0,0,1.908071
...,...,...,...,...,...,...,...,...,...,...
5897,8f14ccc4db353eaefaab6ac8efc4a3b68b2a8731a3b35a...,0,,0,0,0,0,0,0,
10147,f4f9987e642659097752e755b740f7e487f6c252f5be0c...,0,,0,0,0,0,0,0,
10467,fcc49dce067e042b96df3e56d2934e551fa4b45d0dcdb0...,0,,0,0,0,0,0,0,
1822,2a6612b86399adeeb92d2b44603d8736b1dbfcb84ccf24...,0,,0,0,0,0,0,0,


In [15]:
# get the maximum load time for each client and sort by the maximum load time
max_load_time = data.groupby("user_id")["load_time"].max().sort_values()

In [16]:
# add the column to show the maximum load time for each user
user_data["max_load_time"] = user_data["user_id"].apply(
    lambda x: max_load_time[x] if x in max_load_time else None
)

#### Converted Event

In [17]:
event_counts = (
    data.groupby("user_id")
    .apply(
        lambda x: pd.Series(
            {
                "event_bench_load_success_count": (
                    x["event_name"] == "bench load success"
                ).sum(),
                "event_profile_search_success_count": (
                    x["event_name"] == "profile search success"
                ).sum(),
                "event_add_profile_success_count": (
                    x["event_name"] == "add profile success"
                ).sum(),
                "event_upgrade_plan_count": (x["event_name"] == "upgrade_plan").sum(),
                "event_pricing_model_count": (
                    x["event_name"] == "pricing modal visited"
                ).sum(),
                "event_profile_load_fail_count": (
                    x["event_name"] == "profile load fail"
                ).sum(),
                "event_email_receipt_count": (x["event_name"] == "email receipt").sum(),
            }
        )
    )
    .reset_index()
)

  .apply(


In [18]:
# join the event_counts dataframe on user_data using user_id
user_data = user_data.merge(event_counts, on="user_id", how="left")

In [19]:
user_data

Unnamed: 0,user_id,event_count,country,country_United_States,country_Saudi_Arabia,country_India,country_Britain,country_Italy,successful,average_load_time,max_load_time,event_bench_load_success_count,event_profile_search_success_count,event_add_profile_success_count,event_upgrade_plan_count,event_pricing_model_count,event_profile_load_fail_count,event_email_receipt_count
0,cd6d41c28b017733e39fc13e4d4ae3b308c6baff52561d...,6973,United States,1,0,0,0,0,0,1.238146,2.277,0,2,1,3,3,3450,1
1,5929c3eec5a2b12ee4aa911b8093b33ac7b798d59e3b14...,4700,Indonesia,0,0,0,0,0,0,1.040046,10.068,1,2,2,1,1,0,4
2,622748abdc404cbf098568074c232b04762d574fa48130...,3925,Ukraine,0,0,0,0,0,0,1.823422,64.181,0,1,3,4,4,1849,1
3,fd15147418cba03da1908de36ac79bc24589c9d1474716...,3538,Thailand,0,0,0,0,0,0,1.692242,5.232,2,2,3,1,1,1,12
4,5c54eb7d3e4821d8e633e7ef23dacafe3f5a6530715510...,3387,Germany,0,0,0,0,0,0,1.908071,5.204,9,35,20,0,0,1572,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10603,8f14ccc4db353eaefaab6ac8efc4a3b68b2a8731a3b35a...,0,,0,0,0,0,0,0,,,0,0,0,0,0,0,0
10604,f4f9987e642659097752e755b740f7e487f6c252f5be0c...,0,,0,0,0,0,0,0,,,0,0,0,0,0,0,0
10605,fcc49dce067e042b96df3e56d2934e551fa4b45d0dcdb0...,0,,0,0,0,0,0,0,,,0,0,0,0,0,0,0
10606,2a6612b86399adeeb92d2b44603d8736b1dbfcb84ccf24...,0,,0,0,0,0,0,0,,,0,0,0,0,0,0,0


### Transform Platform

In [20]:
# convert to shortcut
data["platform"] = data["platform"].apply(
    lambda x: (
        "fb"
        if x == "facebook" or x == "showFacebook"
        else (
            "tw"
            if x == "twitter"
            else (
                "ig"
                if x == "instagram"
                else (
                    "yt"
                    if x == "youtube"
                    else (
                        "li"
                        if x == "linkedin"
                        else (
                            "tk"
                            if x == "tiktok"
                            else "xch" if x == "cross-platform" else x
                        )
                    )
                )
            )
        )
    )
)

# rename to platform + original name + count
data["platform"] = data["platform"].apply(lambda x: f"platform_{x}_count")

# count
platform_data = (
    data.groupby(["user_id", "platform"]).size().unstack(fill_value=0).reset_index()
)

# merge
user_data = user_data.merge(platform_data, on="user_id", how="left")

In [21]:
# print count of each platform
user_data

Unnamed: 0,user_id,event_count,country,country_United_States,country_Saudi_Arabia,country_India,country_Britain,country_Italy,successful,average_load_time,...,platform_fb_count,platform_hashtags_count,platform_ig_count,platform_li_count,platform_meta_count,platform_nan_count,platform_tk_count,platform_tw_count,platform_xch_count,platform_yt_count
0,cd6d41c28b017733e39fc13e4d4ae3b308c6baff52561d...,6973,United States,1,0,0,0,0,0,1.238146,...,0,0,6947,0,0,4,0,0,22,0
1,5929c3eec5a2b12ee4aa911b8093b33ac7b798d59e3b14...,4700,Indonesia,0,0,0,0,0,0,1.040046,...,0,0,0,0,0,6,4684,0,10,0
2,622748abdc404cbf098568074c232b04762d574fa48130...,3925,Ukraine,0,0,0,0,0,0,1.823422,...,64,0,3829,0,2,4,0,0,26,0
3,fd15147418cba03da1908de36ac79bc24589c9d1474716...,3538,Thailand,0,0,0,0,0,0,1.692242,...,0,0,79,0,0,18,3370,0,75,0
4,5c54eb7d3e4821d8e633e7ef23dacafe3f5a6530715510...,3387,Germany,0,0,0,0,0,0,1.908071,...,0,0,27,0,0,5,3222,40,93,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10603,8f14ccc4db353eaefaab6ac8efc4a3b68b2a8731a3b35a...,0,,0,0,0,0,0,0,,...,0,0,0,0,0,1,0,0,0,0
10604,f4f9987e642659097752e755b740f7e487f6c252f5be0c...,0,,0,0,0,0,0,0,,...,0,0,0,0,0,1,0,0,0,0
10605,fcc49dce067e042b96df3e56d2934e551fa4b45d0dcdb0...,0,,0,0,0,0,0,0,,...,0,0,0,0,0,1,0,0,0,0
10606,2a6612b86399adeeb92d2b44603d8736b1dbfcb84ccf24...,0,,0,0,0,0,0,0,,...,0,0,0,0,0,2,0,0,0,0


In [22]:
# add a column in user_data for total platforms of each user
user_data["platform_total_count"] = user_data.iloc[:, 5:].sum(axis=1)
print(user_data["platform_total_count"].value_counts())

platform_total_count
1.000000      414
2.000000      405
4.000000      278
3.000000      260
5.000000      224
             ... 
148.727273      1
136.054688      1
138.665833      1
138.801647      1
103.599000      1
Name: count, Length: 6900, dtype: int64


### Transform View

In [23]:
# rename
data["view"] = data["view"].apply(lambda x: f"view_{x}")

# convert
view_data = data.groupby(["user_id", "view"]).size().unstack(fill_value=0).reset_index()
view_data

view,user_id,view_add,view_addprofiles,view_ads,view_bench,view_benchmark,view_brands,view_campaigns,view_connect,view_hashtag,view_nan,view_page,view_postsfeed,view_profile,view_proj,view_projecthome,view_reports,view_search,view_settings,view_upgradeplan
0,00060bf870ad790482c39c36da5372f44d56426d327c36...,0,0,0,0,0,0,0,0,0,123,0,50,0,0,29,0,0,0,0
1,0007e623b14ceefae2ea443c21ec2f35e7b1e588f74641...,0,0,0,0,0,0,0,0,0,13,0,0,14,0,0,2,0,0,0
2,00195548c17065bb6ae71b927c39e105d221c970ecf15e...,0,0,0,0,0,0,0,0,8,9,0,0,0,0,3,0,0,0,0
3,0019747558d686ee27c60a39345e2bb09792e44367d45b...,0,0,0,0,3,0,4,0,7,16,0,3,32,0,5,3,0,0,0
4,001f1cc74d3b9a713ea45e46700735bb4d1d0c062244f8...,0,0,0,0,0,0,0,0,0,34,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10603,ffd02b0dd86934f9a0bb95920521a59c64dd209995fbcf...,0,0,0,0,0,0,0,0,0,13,0,0,20,0,1,0,0,0,0
10604,ffd2841661e7cf023a5d5956c68b2b0fbb1b8d27c32fb0...,0,0,0,0,0,0,7,0,0,20,0,0,1,0,0,1,0,0,0
10605,ffd9bf8af2ff5be89bafbc0f81601f0b7885bf7e936996...,0,0,0,0,17,0,5,0,0,29,0,13,71,0,9,1,0,0,0
10606,ffe4b53d23689200a4b17c18a74fed6b4200b0584cc908...,0,0,0,0,35,6,4,0,0,11,0,19,15,0,14,2,0,0,0


In [24]:
# merge
user_data = user_data.merge(view_data, on="user_id", how="left")

In [25]:
user_data

Unnamed: 0,user_id,event_count,country,country_United_States,country_Saudi_Arabia,country_India,country_Britain,country_Italy,successful,average_load_time,...,view_nan,view_page,view_postsfeed,view_profile,view_proj,view_projecthome,view_reports,view_search,view_settings,view_upgradeplan
0,cd6d41c28b017733e39fc13e4d4ae3b308c6baff52561d...,6973,United States,1,0,0,0,0,0,1.238146,...,16,0,8,6945,0,2,0,0,0,2
1,5929c3eec5a2b12ee4aa911b8093b33ac7b798d59e3b14...,4700,Indonesia,0,0,0,0,0,0,1.040046,...,19,0,0,4674,0,3,0,0,0,0
2,622748abdc404cbf098568074c232b04762d574fa48130...,3925,Ukraine,0,0,0,0,0,0,1.823422,...,22,0,0,3889,0,11,0,0,0,3
3,fd15147418cba03da1908de36ac79bc24589c9d1474716...,3538,Thailand,0,0,0,0,0,0,1.692242,...,46,0,8,3430,0,38,3,0,0,0
4,5c54eb7d3e4821d8e633e7ef23dacafe3f5a6530715510...,3387,Germany,0,0,0,0,0,0,1.908071,...,126,0,0,3216,0,14,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10603,8f14ccc4db353eaefaab6ac8efc4a3b68b2a8731a3b35a...,0,,0,0,0,0,0,0,,...,1,0,0,0,0,0,0,0,0,0
10604,f4f9987e642659097752e755b740f7e487f6c252f5be0c...,0,,0,0,0,0,0,0,,...,1,0,0,0,0,0,0,0,0,0
10605,fcc49dce067e042b96df3e56d2934e551fa4b45d0dcdb0...,0,,0,0,0,0,0,0,,...,1,0,0,0,0,0,0,0,0,0
10606,2a6612b86399adeeb92d2b44603d8736b1dbfcb84ccf24...,0,,0,0,0,0,0,0,,...,2,0,0,0,0,0,0,0,0,0


In [26]:
# print unique columns in user_data
print(user_data.columns)
# print number of columns in user_data
print("Number of columns in user_data:", len(user_data.columns))

Index(['user_id', 'event_count', 'country', 'country_United_States',
       'country_Saudi_Arabia', 'country_India', 'country_Britain',
       'country_Italy', 'successful', 'average_load_time', 'max_load_time',
       'event_bench_load_success_count', 'event_profile_search_success_count',
       'event_add_profile_success_count', 'event_upgrade_plan_count',
       'event_pricing_model_count', 'event_profile_load_fail_count',
       'event_email_receipt_count', 'platform_all_count',
       'platform_brbench_count', 'platform_fb_count',
       'platform_hashtags_count', 'platform_ig_count', 'platform_li_count',
       'platform_meta_count', 'platform_nan_count', 'platform_tk_count',
       'platform_tw_count', 'platform_xch_count', 'platform_yt_count',
       'platform_total_count', 'view_add', 'view_addprofiles', 'view_ads',
       'view_bench', 'view_benchmark', 'view_brands', 'view_campaigns',
       'view_connect', 'view_hashtag', 'view_nan', 'view_page',
       'view_postsfeed', 'v

## Export Data as CSV

In [27]:
# # export user_data to csv
# user_data.to_csv("user_data.csv", index=False)

In [28]:
user_data

Unnamed: 0,user_id,event_count,country,country_United_States,country_Saudi_Arabia,country_India,country_Britain,country_Italy,successful,average_load_time,...,view_nan,view_page,view_postsfeed,view_profile,view_proj,view_projecthome,view_reports,view_search,view_settings,view_upgradeplan
0,cd6d41c28b017733e39fc13e4d4ae3b308c6baff52561d...,6973,United States,1,0,0,0,0,0,1.238146,...,16,0,8,6945,0,2,0,0,0,2
1,5929c3eec5a2b12ee4aa911b8093b33ac7b798d59e3b14...,4700,Indonesia,0,0,0,0,0,0,1.040046,...,19,0,0,4674,0,3,0,0,0,0
2,622748abdc404cbf098568074c232b04762d574fa48130...,3925,Ukraine,0,0,0,0,0,0,1.823422,...,22,0,0,3889,0,11,0,0,0,3
3,fd15147418cba03da1908de36ac79bc24589c9d1474716...,3538,Thailand,0,0,0,0,0,0,1.692242,...,46,0,8,3430,0,38,3,0,0,0
4,5c54eb7d3e4821d8e633e7ef23dacafe3f5a6530715510...,3387,Germany,0,0,0,0,0,0,1.908071,...,126,0,0,3216,0,14,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10603,8f14ccc4db353eaefaab6ac8efc4a3b68b2a8731a3b35a...,0,,0,0,0,0,0,0,,...,1,0,0,0,0,0,0,0,0,0
10604,f4f9987e642659097752e755b740f7e487f6c252f5be0c...,0,,0,0,0,0,0,0,,...,1,0,0,0,0,0,0,0,0,0
10605,fcc49dce067e042b96df3e56d2934e551fa4b45d0dcdb0...,0,,0,0,0,0,0,0,,...,1,0,0,0,0,0,0,0,0,0
10606,2a6612b86399adeeb92d2b44603d8736b1dbfcb84ccf24...,0,,0,0,0,0,0,0,,...,2,0,0,0,0,0,0,0,0,0


## Train Test Split

In [29]:
from sklearn.model_selection import train_test_split

# Assuming you have a DataFrame `df` with features `X` and a target column `y`
X = user_data.drop("successful", axis=1)
y = user_data["successful"]

# Perform train-test split, with 80% of the data used for training and 20% for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Now you have your train and test datasets ready
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(8486, 49) (2122, 49) (8486,) (2122,)
