Update CBAD.ipynb

Jeremy Perez · Jeremy Perez · commit 238a80487122 · 2019-07-31T17:26:27.000-06:00
diff --git a/CBAD.ipynb b/CBAD.ipynb
@@ -9,7 +9,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -28,7 +28,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -44,7 +44,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -76,7 +76,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -109,7 +109,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -153,9 +153,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 7,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "ename": "IndentationError",
+     "evalue": "unindent does not match any outer indentation level (<tokenize>, line 47)",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;36m  File \u001b[0;32m\"<tokenize>\"\u001b[0;36m, line \u001b[0;32m47\u001b[0m\n\u001b[0;31m    for rows in dataSet: #Getting features index with missing values\u001b[0m\n\u001b[0m    ^\u001b[0m\n\u001b[0;31mIndentationError\u001b[0m\u001b[0;31m:\u001b[0m unindent does not match any outer indentation level\n"
+     ]
+    }
+   ],
    "source": [
     "def gettingVariables(dataSet,dataSetOption):\n",
     "   \n",
@@ -202,6 +211,11 @@
     "        #############################################################################\n",
     "        #GETTING VARIABLES\n",
     "        #############################################################################\n",
+    "        missingValIndex = [] #Labels of the features (columns) that contain missing values\n",
+    "        for rows in dataSet: #Each item yielded by the data set is one of its column labels\n",
+    "            if dataSet[rows].isnull().sum() != 0: #Column has at least one missing value\n",
+    "                missingValIndex.append(rows) #Store the column label, not the whole data set\n",
+    "\n",
     "        X = dataSet.iloc[:,:-1].values#data\n",
     "        X = pd.DataFrame(X)\n",
     "        Y = dataSet.iloc[:,78].values#Labels\n",
@@ -303,7 +317,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -391,7 +405,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -456,7 +470,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 32,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -496,7 +510,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 33,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -536,7 +550,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 34,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -597,7 +611,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 35,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -640,7 +654,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -681,7 +695,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 37,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -714,7 +728,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 38,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -802,7 +816,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 39,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -850,7 +864,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 40,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -894,7 +908,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 41,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -925,7 +939,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 42,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -975,7 +989,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 43,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1024,7 +1038,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 44,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1083,20 +1097,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 45,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "0"
-      ]
-     },
-     "execution_count": 45,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "def lofF1(Z,Y,clusters,maxVal):\n",
     "    from sklearn.metrics import f1_score\n",
@@ -1142,134 +1145,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "**************************************************\n",
-      "DATA SET MENU\n",
-      "**************************************************\n",
-      "1.NSL-KDD\n",
-      "2.IDS 2017\n",
-      "Option:2\n",
-      "Path of the File:/Users/jeremyperez/GoogleDrive/University/Montana-REU/Dataset/CICIDS2017.csv\n",
-      "Dataset has feature names[y/n]:y\n",
-      "\n",
-      "\n",
-      "**************************************************\n",
-      "Data has missing values\n",
-      "**************************************************\n",
-      "Features with missing values: ['Flow Bytes/s', ' Flow Packets/s']\n",
-      "Total missing Values ->  2594\n",
-      "0.004740190678829842 %\n",
-      "\n",
-      "\n",
-      "**************************************************\n",
-      "Manage Missing Values \n",
-      "**************************************************\n",
-      "1.Eliminate Catg. w/ Missing Values\n",
-      "2.Impute 0 for Missing Values\n",
-      "3.Impute Mean for Missing Values\n",
-      "4.Impute Median for Missing Values\n",
-      "5.Impute Mode for Missing Values\n",
-      "6.MICE Method\n",
-      "Option:6\n",
-      "\n",
-      "\n",
-      "#########################################################################\n",
-      "Sucessfully Imputed Simple Imputer \n",
-      "#########################################################################\n",
-      "\n",
-      "\n",
-      "#########################################################################\n",
-      "Encoding Menu\n",
-      "#########################################################################\n",
-      "1.Binary true labels: normal = 0, abnormal = 1\n",
-      "2. Multiclass true labels: BENIGN= 0, DoS slowloris= 1, DoS Slowhttptest= 2, DoS Hulk= 3, DoS GoldenEye= 4, Heartbleed= 5\n",
-      "Enter option :1\n",
-      "Scale data [y/n]:y\n",
-      "\n",
-      "\n",
-      "#########################################################################\n",
-      "Data has been successfully scaled.\n",
-      "#########################################################################\n",
-      "Shuffle data [y]/[n]:y\n",
-      "\n",
-      "\n",
-      "#########################################################################\n",
-      "Data has been successfully shuffled.\n",
-      "#########################################################################\n",
-      "\n",
-      "\n",
-      "#########################################################################\n",
-      "Algorithm Menu\n",
-      "#########################################################################\n",
-      "1.Kmeans\n",
-      "2.Dbscan\n",
-      "3.Isolation Forest\n",
-      "4.Local Factor Outlier\n",
-      "option:1\n",
-      "\n",
-      "\n",
-      "#########################################################################\n",
-      "KMEANS ALGORITHM\n",
-      "#########################################################################\n",
-      "Number of clusters:5\n",
-      "Initialization method [k-means++,random]:random\n",
-      "\n",
-      "Clustering...\n",
-      "\n",
-      "\n",
-      "\n",
-      "Run Time -> --- 1.2159347534179688e-05 seconds ---\n",
-      "Data Successfully Clustered\n",
-      "#########################################################################\n",
-      "KMEANS RESULTS\n",
-      "\n",
-      "\n",
-      "Clusters ->  [0, 1, 2, 3, 4] \n",
-      "\n",
-      "Inertia ->  346221.14566593803\n",
-      "col_0      0      1      2       3       4\n",
-      "row_0                                     \n",
-      "0      80801  41779  59390  135897  122164\n",
-      "1       1104    541    840    1680    1631\n",
-      "2        994    504    730    1715    1556\n",
-      "3      42586  21931  31022   71063   64471\n",
-      "4       1915    984   1275    3232    2887\n",
-      "5          1      2      4       1       3 \n",
-      "\n",
-      "\n",
-      "Max True Label \n",
-      "\n",
-      " col_0\n",
-      "0    0\n",
-      "1    0\n",
-      "2    0\n",
-      "3    0\n",
-      "4    0\n",
-      "dtype: int64\n",
-      "#########################################################################\n",
-      "\n",
-      "\n",
-      "#########################################################################\n",
-      "Kmeans Score Metrics Menu\n",
-      "#########################################################################\n",
-      "1.F1 Score\n",
-      "2.Normalized Mutual Info Score\n",
-      "3.Adjusted Rand Score\n",
-      "option:1\n",
-      "Average Method[weighted,micro,macro,binary]:micro\n",
-      "\n",
-      "\n",
-      "#########################################################################\n",
-      "Cluster Matchings by Maximun Intersection[Found: True] ->  {0: 0, 1: 0, 2: 0, 3: 0, 4: 0}\n",
-      "KMEANS F1 Score ->  0.6352376126565065\n",
-      "#########################################################################\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "clear()\n",
     "##########################################################################\n",