In [1]:
import pandas as pd
from datetime import timedelta
import matplotlib.pyplot as plt
import matplotlib

matplotlib.style.use('ggplot') # Apply the 'ggplot' style sheet to every figure drawn below
#%matplotlib inline

#
# INFO: This dataset has call records for 10 users tracked over the course of 3 years.
# Your job is to find out where the users likely live!

In [2]:
def showandtell(title=None):
  """Optionally save the current matplotlib figure, then display it.

  Parameters
  ----------
  title : str or None
      When given, the current figure is written to "<title>.png" (tight
      bounding box, 300 dpi) before it is shown. When None, nothing is saved.
  """
  # `is not None` is the idiomatic identity test and, unlike `!=`, cannot be
  # affected by an object that overloads comparison operators.
  if title is not None:
    plt.savefig(title + ".png", bbox_inches='tight', dpi=300)
  plt.show()

def clusterInfo(model):
  """Print a summary of a fitted clustering model.

  Prints the model's inertia, then for every cluster its index, its centroid
  and the number of samples assigned to it.

  Parameters
  ----------
  model : object
      Must expose `inertia_`, `cluster_centers_` and `labels_`; `labels_` has
      to support element-wise `== i` followed by `.sum()` (e.g. a NumPy array).
  """
  # Every message is formatted into ONE string before printing, so the output
  # is identical under the Python 2 print statement and the Python 3 print
  # function (a bare `print "a", b` is a SyntaxError on Python 3).
  print("Cluster Analysis Inertia:  %s" % (model.inertia_,))
  print('------------------------------------------')
  for i, centroid in enumerate(model.cluster_centers_):
    print("\n  Cluster  %s" % (i,))
    print("    Centroid  %s" % (centroid,))
    # Summing the boolean mask counts the samples labelled `i`.
    print("    #Samples  %s" % ((model.labels_ == i).sum(),))

# Find the cluster with the least # attached nodes
def clusterWithFewestSamples(model):
  """Return a boolean mask selecting the samples of the smallest cluster.

  Parameters
  ----------
  model : object
      Must expose `cluster_centers_` and `labels_`; `labels_` has to support
      element-wise `== i` followed by `.sum()` (e.g. a NumPy array).

  Returns
  -------
  The result of `model.labels_ == <index of the smallest cluster>`. When
  several clusters tie for the fewest samples, the lowest index wins. The
  chosen index is also printed.
  """
  # Start from the total sample count: no cluster can hold more than that, so
  # any cluster with fewer samples replaces the initial choice (cluster 0).
  minSamples = len(model.labels_)
  minCluster = 0
  for i in range(len(model.cluster_centers_)):
    nSamples = (model.labels_ == i).sum()  # count once per cluster
    if nSamples < minSamples:
      minCluster = i
      minSamples = nSamples
  # Single pre-formatted string: same output on Python 2 and Python 3.
  print("\n  Cluster With Fewest Samples:  %s" % (minCluster,))
  return (model.labels_ == minCluster)

In [3]:
def doKMeans(data, clusters=0, random_state=None):
    """Fit a K-Means model to `data` and return the fitted model.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        The values to cluster; passed to `KMeans.fit` unchanged. Callers are
        expected to pass ONLY the coordinates, latitude first and longitude
        second. Both are (approximately) on the same scale, so no feature
        scaling is applied here.
    clusters : int
        Number of clusters (K). Must be at least 1. The default of 0 is not a
        usable K, so callers have to choose a value explicitly.
    random_state : int or None, optional
        Forwarded to `KMeans`. Pass an integer to make the clustering
        reproducible between runs; None keeps the previous unseeded behavior.

    Returns
    -------
    sklearn.cluster.KMeans
        The fitted estimator.

    Raises
    ------
    ValueError
        If `clusters` is smaller than 1.
    """
    # Fail early with a clear message instead of an error raised from deep
    # inside scikit-learn when K is left at its unusable default.
    if clusters < 1:
        raise ValueError(
            "doKMeans requires clusters >= 1, got %r" % (clusters,))

    from sklearn.cluster import KMeans
    model = KMeans(n_clusters=clusters, random_state=random_state)
    model.fit(data)
    return model

In [4]:
#
# Load the call-record dataset from CSV (the first row holds the column names)
# and preview its first rows. The CallDate / CallTime columns are still plain
# strings at this point; they are converted in a later cell.
#
datafile = './Datasets/CDR.csv'
df = pd.read_csv(datafile)
df.head()

Unnamed: 0,In,Out,Direction,CallDate,CallTime,DOW,Duration,TowerID,TowerLat,TowerLon
0,4638472273,2666307251,Incoming,2010-12-25,07:16:24.736813,Sat,0:02:41.741499,0db53dd3-eb9c-4344-abc5-c2d74ebc3eec,32.731611,-96.709417
1,4638472273,1755442610,Incoming,2010-12-25,21:18:30.053710,Sat,0:02:47.108750,aeaf8b43-8034-44fe-833d-31854a75acbf,32.731722,-96.7095
2,4638472273,5481755331,Incoming,2010-12-25,14:52:42.878016,Sat,0:04:35.356341,fadaa83f-6001-45fd-aa4a-17d6c6b7ec00,32.899944,-96.910389
3,4638472273,1755442610,Incoming,2010-12-25,16:02:09.001913,Sat,0:02:23.498499,fadaa83f-6001-45fd-aa4a-17d6c6b7ec00,32.899944,-96.910389
4,4638472273,2145623768,Incoming,2010-12-25,15:28:35.028554,Sat,0:03:54.692497,95d7920d-c3cd-4d20-a568-9a55800dc807,32.899944,-96.910389


In [5]:
# Inspect the column dtypes pandas inferred when reading the CSV.
df.dtypes

In             int64
Out            int64
Direction     object
CallDate      object
CallTime      object
DOW           object
Duration      object
TowerID       object
TowerLat     float64
TowerLon     float64
dtype: object

In [6]:
# Parse the date and time-of-day columns; with errors='coerce' any value that
# cannot be parsed becomes NaT instead of raising.
df['CallDate'] = pd.to_datetime(df['CallDate'], errors='coerce')
df['CallTime'] = pd.to_timedelta(df['CallTime'], errors='coerce')

In [7]:
#
# Collect the distinct "In" phone numbers (the users) into a regular Python list.
# Hint: https://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.tolist.html
#
In_list = df.In.unique().tolist()
# Parenthesized single argument: prints the same on Python 2 and Python 3.
print(In_list)

[4638472273, 1559410755, 4931532174, 2419930464, 1884182865, 3688089071, 4555003213, 2068627935, 2894365987, 8549533077]


In [8]:
#
# INFO: The locations map above should be too "busy" to really wrap your head around. This
# is where domain expertise comes into play. Your intuition tells you that people are likely
# to behave differently on weekends:
#
# On Weekends:
#   1. People probably don't go into work
#   2. They probably sleep in late on Saturday
#   3. They probably run a bunch of random errands, since they couldn't during the week
#   4. They should be home, at least during the very late hours, e.g. 1-4 AM
#
# On Weekdays:
#   1. People probably are at work during normal working hours
#   2. They probably are at home in the early morning and during the late night
#   3. They probably spend time commuting between work and home everyday

In [9]:
#
# Create a slice called user1 holding only the records whose "In" feature
# (user phone number) equals one entry of the unique list above.
#
# NOTE: the entry is chosen by position. Index 1 is the SECOND number on the
# list; the printed label uses the same variable so the message always matches
# the user that is actually being filtered.
#
user_index = 1
print("\n\nExamining person:  %d" % user_index)
user1 = df[df.In == In_list[user_index]]
user1



Examining person:  0


Unnamed: 0,In,Out,Direction,CallDate,CallTime,DOW,Duration,TowerID,TowerLat,TowerLon
7,1559410755,6092528894,Missed,2010-12-25,15:15:56.502972,Sat,0:11:52.952187,b4319acf-b475-4c3e-a2e0-03b2dd2daf9e,32.696722,-96.934306
8,1559410755,6092528894,Incoming,2010-12-25,20:15:19.667734,Sat,0:11:52.951080,f958754c-3d55-47c4-8236-50b964a7b997,32.870972,-96.923556
9,1559410755,8125446700,Missed,2010-12-25,10:01:02.162977,Sat,0:14:11.046844,07dec2d7-b5d1-410d-8879-ecf7385af719,32.696083,-96.934333
10,1559410755,5452154622,Incoming,2010-12-25,19:57:00.050868,Sat,0:14:40.319677,f958754c-3d55-47c4-8236-50b964a7b997,32.870972,-96.923556
11,1559410755,1747156055,Incoming,2010-12-25,10:08:39.978468,Sat,0:12:06.826578,b4319acf-b475-4c3e-a2e0-03b2dd2daf9e,32.696722,-96.934306
12,1559410755,8484043137,Incoming,2010-12-25,20:57:01.994902,Sat,0:14:10.485012,1cffcc44-1c60-4ca2-9a37-d3862bbe9702,32.870972,-96.923556
13,1559410755,4182819078,Incoming,2010-12-25,17:35:08.626059,Sat,0:14:14.315459,d4e84085-292b-4fcb-bdc1-9813ca24e696,32.696083,-96.934333
14,1559410755,8484043137,Incoming,2010-12-25,09:07:13.394117,Sat,0:13:46.403655,b4319acf-b475-4c3e-a2e0-03b2dd2daf9e,32.696722,-96.934306
15,1559410755,6448594093,Missed,2010-12-25,15:07:17.355913,Sat,0:10:20.644213,73900a57-7f0f-46e8-a30b-aa84828eee40,32.695556,-96.938333
16,1559410755,7137960738,Incoming,2010-12-25,19:18:03.939869,Sat,0:10:42.917630,8b54cc68-bb4d-4baf-b824-4cd1ba67bfb0,32.870972,-96.923556


In [10]:
#
# Narrow the slice to weekday (Mon-Fri) records by dropping every call whose
# day-of-week is Saturday or Sunday.
#
user1 = user1[~user1.DOW.isin(['Sat', 'Sun'])]
user1

Unnamed: 0,In,Out,Direction,CallDate,CallTime,DOW,Duration,TowerID,TowerLat,TowerLon
162,1559410755,6092528894,Incoming,2010-12-27,13:58:15.173497,Mon,0:02:39.474541,78dc86cb-a8f2-4da8-b7c2-ff09e6e73579,32.696083,-96.934333
163,1559410755,5510742109,Incoming,2010-12-27,19:26:17.851404,Mon,0:03:50.854513,8b54cc68-bb4d-4baf-b824-4cd1ba67bfb0,32.870972,-96.923556
164,1559410755,8484043137,Missed,2010-12-27,06:46:48.804762,Mon,0:05:41.296615,f958754c-3d55-47c4-8236-50b964a7b997,32.870972,-96.923556
165,1559410755,4091221367,Incoming,2010-12-27,19:29:34.024910,Mon,0:04:18.209604,3adc057c-4ec8-44b7-bf6b-7b1c4fe9a99a,32.870972,-96.923556
166,1559410755,1747156055,Incoming,2010-12-27,17:39:55.428982,Mon,0:05:55.991975,8a3dca51-1760-4113-b7d8-d515ec6a8407,32.695556,-96.938333
167,1559410755,5452154622,Incoming,2010-12-27,07:46:33.266193,Mon,0:06:17.798725,6672464e-9093-499b-947f-322a6e4441fb,32.696083,-96.934333
168,1559410755,1747156055,Incoming,2010-12-27,06:55:06.699239,Mon,0:04:06.506535,af8c4c52-6d95-4892-86c2-ee80c4831256,32.870972,-96.923556
169,1559410755,7818412902,Incoming,2010-12-27,06:09:32.957209,Mon,0:05:41.989601,8b54cc68-bb4d-4baf-b824-4cd1ba67bfb0,32.870972,-96.923556
170,1559410755,1146268309,Missed,2010-12-27,07:00:13.029576,Mon,0:04:50.811054,245134b4-96ca-455e-a5dc-2b5777c00d62,32.870944,-96.923528
171,1559410755,7823046796,Missed,2010-12-27,19:49:57.752700,Mon,0:06:38.861485,267195f3-2915-4333-b411-702baead1225,32.870944,-96.923528


In [11]:
#
# Keep only calls placed strictly after 07:30 and strictly before 17:00.
# Assumptions behind the window: from midnight to 7:30am the user is probably
# asleep and will not place or answer calls; after a brief morning commute they
# spend the rest of the day at work. Most remaining calls should therefore come
# from the work location, with home a distant second.
#
work_start = timedelta(hours=7, minutes=30)
work_end = timedelta(hours=17)
user1 = user1[(user1.CallTime > work_start) & (user1.CallTime < work_end)]
user1

Unnamed: 0,In,Out,Direction,CallDate,CallTime,DOW,Duration,TowerID,TowerLat,TowerLon
162,1559410755,6092528894,Incoming,2010-12-27,13:58:15.173497,Mon,0:02:39.474541,78dc86cb-a8f2-4da8-b7c2-ff09e6e73579,32.696083,-96.934333
167,1559410755,5452154622,Incoming,2010-12-27,07:46:33.266193,Mon,0:06:17.798725,6672464e-9093-499b-947f-322a6e4441fb,32.696083,-96.934333
206,1559410755,7823046796,Incoming,2010-12-28,07:58:49.648882,Tue,0:09:39.243985,ae0cc295-3666-4d33-984b-b2c089b857a2,32.724722,-96.915000
208,1559410755,5510742109,Incoming,2010-12-28,08:41:31.686402,Tue,0:09:42.904651,cb350a49-4b2c-4f7b-a607-20912c5cf344,32.695861,-96.937944
209,1559410755,8205293065,Incoming,2010-12-28,16:36:06.549773,Tue,0:08:46.672855,cb350a49-4b2c-4f7b-a607-20912c5cf344,32.695861,-96.937944
210,1559410755,4091221367,Missed,2010-12-28,10:07:51.337205,Tue,0:08:58.496012,d4e84085-292b-4fcb-bdc1-9813ca24e696,32.696083,-96.934333
211,1559410755,7818412902,Incoming,2010-12-28,09:24:17.942324,Tue,0:10:17.702303,149b01f7-fe73-41d9-af83-7496f959962c,32.695861,-96.937944
213,1559410755,6403295440,Incoming,2010-12-28,12:08:50.154966,Tue,0:08:43.469401,8ae142fc-9b22-4aab-9369-dd71bcf0ee9c,32.696083,-96.934333
214,1559410755,6403295440,Missed,2010-12-28,07:50:27.661177,Tue,0:07:24.578839,803d4a99-fdb3-48d2-8b8c-1860f1172cb6,32.724722,-96.915000
219,1559410755,1747156055,Missed,2010-12-28,08:13:59.582759,Tue,0:10:40.455480,78dc86cb-a8f2-4da8-b7c2-ff09e6e73579,32.696083,-96.934333


In [12]:
#
# Plot the cell towers the user connected to: longitude on the x axis,
# latitude on the y axis. The low alpha makes towers that were used many
# times appear darker than rarely used ones.
#
fig, ax = plt.subplots()
ax.scatter(user1.TowerLon, user1.TowerLat, c='g', marker='o', alpha=0.2)
ax.set_xlabel('Tower longitude')
# The trailing ';' stops the notebook from echoing the returned object's repr.
ax.set_ylabel('Tower latitude');

<matplotlib.collections.PathCollection at 0x11665e190>

In [13]:
#
# INFO: Run K-Means with K=3 or K=4 (K=4 is used here). There should really be
# only two areas of concentration. If several "hot" areas show up that are FAR
# apart from one another, increase K to 5: the goal is for all centroids except
# two to sweep up the outliers and the not-home, not-work travel occasions,
# while the other two zero in on the user's approximate home and work
# locations -- or rather on the cell towers closest to them.
#
tower_coords = user1[['TowerLat', 'TowerLon']]  # latitude first, then longitude
model = doKMeans(tower_coords, clusters=4)

In [None]:
#
# INFO: Print the mean CallTime of the samples belonging to the cluster with the
# FEWEST samples attached to it. If our logic is correct, the cluster with the
# MOST samples will be work and the cluster with the 2nd most samples will be
# home, so the smallest cluster should lie somewhere in between the two. Its
# mean CallTime estimates when, on average, the user is between home and work
# inside the time window kept by the CallTime filter above (07:30 to 17:00).
#
midWayClusterIndices = clusterWithFewestSamples(model)
# Boolean mask aligned with user1's rows (the model was fitted on user1).
midWaySamples = user1[midWayClusterIndices]
# Single pre-formatted string: same output on Python 2 and Python 3.
print("    Its Waypoint Time:  %s" % (midWaySamples.CallTime.mean(),))


  Cluster With Fewest Samples:  3
    Its Waypoint Time:  0 days 08:22:21.479006


In [None]:
#
# Let's visualize the results!
# First draw the X's for the clusters. The model was fitted on (lat, lon), so
# column 1 of cluster_centers_ (longitude) is x and column 0 (latitude) is y,
# matching the axes of the tower scatter plot.
ax.scatter(model.cluster_centers_[:,1], model.cluster_centers_[:,0], s=169, c='r', marker='x', alpha=0.8, linewidths=2)
# Parenthesized single argument: prints the same on Python 2 and Python 3.
print(model.cluster_centers_)
#
# Then save the results:
showandtell('Weekday Calls Centroids')  # Comment this line out when you're ready to proceed